<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i7e18910</article-id>
      <article-id pub-id-type="pmid">32501278</article-id>
      <article-id pub-id-type="doi">10.2196/18910</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Reliability of Supervised Machine Learning Using Synthetic Data in Health Care: Model to Preserve Privacy for Data Sharing</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Dankar</surname>
            <given-names>Fida</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Rankin</surname>
            <given-names>Debbie</given-names>
          </name>
          <degrees>BSc, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>School of Computing, Engineering and Intelligent Systems</institution>
            <institution>Ulster University</institution>
            <addr-line/>
            <addr-line>Derry~Londonderry, </addr-line>
            <country>United Kingdom</country>
            <phone>44 28 7167 5841 ext 5841</phone>
            <email>d.rankin1@ulster.ac.uk</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2110-0599</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Black</surname>
            <given-names>Michaela</given-names>
          </name>
          <degrees>BSc, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5140-2566</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Bond</surname>
            <given-names>Raymond</given-names>
          </name>
          <degrees>BSc, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1078-2232</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Wallace</surname>
            <given-names>Jonathan</given-names>
          </name>
          <degrees>BA, MSc</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8415-4001</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Mulvenna</surname>
            <given-names>Maurice</given-names>
          </name>
          <degrees>BSc, MPhil, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1554-0785</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Epelde</surname>
            <given-names>Gorka</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5179-415X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>School of Computing, Engineering and Intelligent Systems</institution>
        <institution>Ulster University</institution>
        <addr-line>Derry~Londonderry</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>School of Computing</institution>
        <institution>Ulster University</institution>
        <addr-line>Jordanstown</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Vicomtech Foundation</institution>
        <institution>Basque Research and Technology Alliance</institution>
        <addr-line>Donostia-San Sebastián</addr-line>
        <country>Spain</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Biodonostia Health Research Institute, eHealth Group</institution>
        <addr-line>Donostia-San Sebastián</addr-line>
        <country>Spain</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Debbie Rankin <email>d.rankin1@ulster.ac.uk</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>7</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>20</day>
        <month>7</month>
        <year>2020</year>
      </pub-date>
      <volume>8</volume>
      <issue>7</issue>
      <elocation-id>e18910</elocation-id>
      <history>
        <date date-type="received">
          <day>26</day>
          <month>3</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>20</day>
          <month>4</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>24</day>
          <month>4</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>4</day>
          <month>6</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Debbie Rankin, Michaela Black, Raymond Bond, Jonathan Wallace, Maurice Mulvenna, Gorka Epelde. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 20.07.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://medinform.jmir.org/2020/7/e18910/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The exploitation of synthetic data in health care is at an early stage. Synthetic data could unlock the potential within health care datasets that are too sensitive for release. Several synthetic data generators have been developed to date; however, studies evaluating their efficacy and generalizability are scarce.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This work sets out to understand the difference in performance of supervised machine learning models trained on synthetic data compared with those trained on real data.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>A total of 19 open health datasets were selected for experimental work. Synthetic data were generated using three synthetic data generators that apply classification and regression trees, parametric, and Bayesian network approaches. Real and synthetic data were used (separately) to train five supervised machine learning models: stochastic gradient descent, decision tree, k-nearest neighbors, random forest, and support vector machine. Models were tested only on real data to determine whether a model developed by training on synthetic data can be used to accurately classify new, real examples. The impact of statistical disclosure control on model performance was also assessed.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>A total of 92% of models trained on synthetic data have lower accuracy than those trained on real data. Tree-based models trained on synthetic data have deviations in accuracy from models trained on real data of 0.177 (18%) to 0.193 (19%), while other models have lower deviations of 0.058 (6%) to 0.072 (7%). The winning classifier when trained and tested on real data versus models trained on synthetic data and tested on real data is the same in 26% (5/19) of cases for classification and regression tree and parametric synthetic data and in 21% (4/19) of cases for Bayesian network-generated synthetic data. Tree-based models perform best with real data and are the winning classifier in 95% (18/19) of cases. This is not the case for models trained on synthetic data. When tree-based models are not considered, the winning classifier for real and synthetic data is matched in 74% (14/19), 53% (10/19), and 68% (13/19) of cases for classification and regression tree, parametric, and Bayesian network synthetic data, respectively. Statistical disclosure control methods did not have a notable impact on data utility.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The results of this study are promising with small decreases in accuracy observed in models trained with synthetic data compared with models trained with real data, where both are tested on real data. Such deviations are expected and manageable. Tree-based classifiers have some sensitivity to synthetic data, and the underlying cause requires further investigation. This study highlights the potential of synthetic data and the need for further evaluation of their robustness. Synthetic data must ensure individual privacy and data utility are preserved in order to instill confidence in health care departments when using such data to inform policy decision-making.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>synthetic data</kwd>
        <kwd>supervised machine learning</kwd>
        <kwd>data utility</kwd>
        <kwd>health care</kwd>
        <kwd>decision support</kwd>
        <kwd>statistical disclosure control</kwd>
        <kwd>privacy</kwd>
        <kwd>open data</kwd>
        <kwd>stochastic gradient descent</kwd>
        <kwd>decision tree</kwd>
        <kwd>k-nearest neighbors</kwd>
        <kwd>random forest</kwd>
        <kwd>support vector machine</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>National health care departments hold volumes of data on patients and the population, and this information is not being used to its full potential due to valid privacy concerns. Machine learning has the potential to improve decisions and outcomes in health care, but these improvements have yet to be fully realized. The reasons may be related to issues facing many data scientists and researchers in this area: the limited availability of or access to data or the readiness of health care institutions to share data. Privacy concerns over personal data, and in particular health care data, mean that although the data exist, they are deemed too sensitive for public release [<xref ref-type="bibr" rid="ref1">1</xref>], even for research purposes.</p>
        <p>One way to overcome the issue of data availability is to use fully synthetic data as an alternative to real data. The exploitation of synthetic data in health care is at an early stage and gaining attention. Synthetic data are simulated from real data by using the underlying statistical properties of the real data to produce synthetic datasets that exhibit these same statistical properties. Synthetic data can represent the population in the original data while avoiding any divulgence of real personal, potentially confidential, and sensitive data. In the case of health-related data, this would ensure that actual patient records are not disclosed thus avoiding governance and confidentiality issues. There are three types of synthetic data: fully synthetic, partially synthetic, and hybrid synthetic. This work considers fully synthetic data that does not contain original data.</p>
        <p>Synthetic data can be used in two ways: to augment an existing dataset thus increasing its size, for times when a dataset is unbalanced due to the limited occurrence of an event or when more examples are required [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>] and to generate a fully synthetic dataset that is representative of the original dataset, for times when data are not available due to their sensitive nature [<xref ref-type="bibr" rid="ref4">4</xref>]. The latter is considered in this work as a key requirement for health care data sharing.</p>
        <p>Traditionally, data perturbation techniques such as data swapping, data masking, cell suppression, and adding noise have been applied to real data to modify and thus protect the data from disclosure prior to releasing it. However, such methods do not eliminate disclosure risk and can impact the utility of the data, particularly if multivariate relationships are not considered [<xref ref-type="bibr" rid="ref5">5</xref>]. Synthetic data was first proposed by Rubin [<xref ref-type="bibr" rid="ref6">6</xref>] and Little [<xref ref-type="bibr" rid="ref7">7</xref>]. Raghunathan et al [<xref ref-type="bibr" rid="ref8">8</xref>] implemented and extended upon this, pioneering the multiple imputation approach to synthetic data generation, exemplified in a range of studies [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref14">14</xref>]. Reiter [<xref ref-type="bibr" rid="ref15">15</xref>] then introduced an alternative method of synthesizing data through a nonparametric tree–based technique that uses classification and regression trees (CART). A more recent technique proposes a Bayesian network approach for synthetic data generation [<xref ref-type="bibr" rid="ref16">16</xref>]. Synthetic data is considered a secure approach for enabling public release of sensitive data as it goes beyond traditional deidentification methods by generating a fake dataset that does not contain any of the original, identifiable information from which it was generated, while retaining the valid statistical properties of the real data. Therefore, the risk of reverse engineering or disclosure of a real person is considered to be unlikely [<xref ref-type="bibr" rid="ref17">17</xref>].</p>
        <p>While a number of synthetic data generators have been developed, empirical evidence of their efficacy has not been fully explored. This work extends a preliminary study [<xref ref-type="bibr" rid="ref18">18</xref>] and investigates whether fully synthetic data can preserve the hidden complex patterns supervised machine learning can uncover from real data and therefore whether it can be used as a valid alternative to real data when developing eHealth apps and health care policy making solutions. This will be achieved by experimenting with a range of open health care datasets. Synthetic data will be generated using three well-known synthetic data generation techniques. Supervised machine learning algorithms will be used to validate the performance of the synthetic datasets. Statistical disclosure control (SDC) methods that can further decrease the disclosure risk associated with synthetic data will also be considered.</p>
      </sec>
      <sec>
        <title>Overview</title>
        <p>To inform the viability of the use of synthetic data as a valid and reliable alternative to real data in the health care domain, we will answer the following research questions:</p>
        <list list-type="bullet">
          <list-item>
            <p>What is the differential in performance when using synthetic data versus real data for training and testing supervised machine learning models?</p>
          </list-item>
          <list-item>
            <p>What is the variance of absolute difference of accuracies between machine learning models trained on real and synthetic datasets?</p>
          </list-item>
          <list-item>
            <p>How often does the winning machine learning technique change when moving from training using real data to training using synthetic data?</p>
          </list-item>
          <list-item>
            <p>What is the impact of SDC (ie, privacy protection) measures on the utility of synthetic data (ie, similarity to real data)?</p>
          </list-item>
        </list>
        <p>To answer these questions, 19 open health care datasets containing both categorical and numerical data were selected for experimentation [<xref ref-type="bibr" rid="ref19">19</xref>]. Synthetic datasets were generated for each dataset using three popular synthetic data generators that apply CART [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref17">17</xref>], parametric [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref17">17</xref>], and Bayesian network [<xref ref-type="bibr" rid="ref16">16</xref>] approaches to enable a robust comparison of the three synthetic data generation techniques across a broad range of data.</p>
        <p>Initially, we analyzed whether the multivariate relationships that exist in the real data were preserved in the synthetic versions of the data for data generated using each of the three synthetic data generation techniques by computing pairwise mutual information scores for each variable pair combination in each dataset [<xref ref-type="bibr" rid="ref16">16</xref>]. It is important that such relationships are retained when data are synthesized.</p>
        <p>To evaluate the utility of synthetic data for machine learning, we then investigated the performance of supervised machine learning models trained on synthetic data and tested on real data compared with models trained on real data and also tested on the real data. This allowed us to determine if a model developed using synthetic data can classify real data examples as accurately and reliably as a model developed using real data. We considered five supervised machine learning models to compare performance and determine if there were differences in robustness across the models. Standard evaluation metrics were computed for models trained on real and synthetic data, for each machine learning model, and for each dataset [<xref ref-type="bibr" rid="ref20">20</xref>]. The differences in accuracy for models trained on synthetic data versus models trained on real data were computed to analyze the extent to which synthetic data causes a degradation in model performance, if any.</p>
        <p>It is pertinent that the optimal machine learning model built using synthetic data matches the optimal machine learning model that would be selected if real data were used in the model training process. This would provide stakeholders in health care with confidence in the use of synthetic data for model development. Thus, we considered how often the best machine learning classifier built using synthetic data matches the best machine learning model built using real data.</p>
        <p>Finally, the impact of a number of SDC methods on model performance was assessed. SDC methods seek to further enhance data privacy; however, this can lead to a loss in usefulness of the data [<xref ref-type="bibr" rid="ref21">21</xref>], and we considered the extent to which performance degradation occurs as a result of SDC.</p>
        <p>This large-scale assessment of the reliability of synthetic data when used for supervised machine learning using 19 health care datasets and 3 synthetic data generation techniques provides an important contribution in relation to the trust and confidence that stakeholders in health care can have in synthetic data. We also propose a pipeline to illustrate how synthetic data can potentially fit within the health care provider context. This work demonstrates the promising performance of synthetic data while highlighting its limitations and future work directions to overcome them.</p>
      </sec>
      <sec>
        <title>Synthetic Data: Present and Future Use</title>
        <p>The validity and disclosure risk associated with synthetic data has been under investigation by the US Census Bureau since 2003 for the purpose of creating public use data from a combination of sensitive data from the Census Bureau’s Survey of Income and Program Participation, the Internal Revenue Service’s individual lifetime earnings data, and the Social Security Administration’s individual benefit data [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. The goal was to enable the release of synthesized person-level records containing personal and financial characteristics from confidential datasets while preserving privacy. Successful results have led to the release of public use synthetic data files. Researchers can have their work validated against the gold standard (real) data by the Census Bureau, thus enabling them to determine the impact of synthetic data on their exploratory analyses and model development and have confidence in their results while also allowing the Census Bureau to continuously improve their synthesis techniques. The public release of this data has provided significant benefit to the research community and general population, enabling more extensive economic policy research to be performed by groups who could not previously access useful data [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref29">29</xref>]. This work led to the release of further synthetic datasets by the Census Bureau. The Synthetic Longitudinal Business Database comprises data from an annual economic census of establishments in the United States [<xref ref-type="bibr" rid="ref30">30</xref>]. This dataset provides broad access to rich data that supports the research and policy-making communities in business- and employment-related topics. 
OnTheMap is a tool using synthetic data to provide information on US citizens such as workforce-related maps, demographic profiles, and reports on analyses of information including the location and characteristics of workers living or working in selected areas, the distance and direction totals between residence and employment locations for workers in selected areas, and disaster event information and the impact of such events on workers and employers [<xref ref-type="bibr" rid="ref31">31</xref>]. Similarly, synthetic data has also been under investigation in the United Kingdom as a means to provide public access to rich data from UK longitudinal studies [<xref ref-type="bibr" rid="ref32">32</xref>-<xref ref-type="bibr" rid="ref34">34</xref>] that contain highly sensitive data linking national census data to administrative data for individuals and their families.</p>
        <p>These datasets enable researchers to explore data and develop and test code and models outside the secure environment where real data reside with no restrictions while the data owners provide a mechanism where results, code, and models can be validated on behalf of researchers on the real data within the secure environment and feedback provided. This process increases research productivity while ensuring the development of robust and valid models [<xref ref-type="bibr" rid="ref35">35</xref>].</p>
        <p>While synthetic data have been used to accelerate and democratize business and economic policy research [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref35">35</xref>], the process is not currently in use for health care research, an area that could benefit enormously. With advancements in technology, particularly machine learning and artificial intelligence (AI), the potential to develop diagnostic tools for clinicians and data driven decision-making platforms for health policy-makers is ever increasing [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]. Such tools require access to health care data, for example, to train AI algorithms and produce models that can identify health conditions and health-related patterns across the population. Currently, it can take a lengthy period of time for researchers to gain access to health care data, a rich and underused resource, due to privacy concerns [<xref ref-type="bibr" rid="ref38">38</xref>-<xref ref-type="bibr" rid="ref42">42</xref>]. For example, in the case of the 40-month Meaningful Integration of Data, Analytics, and Services (MIDAS) Project [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref43">43</xref>] developing a data-driven decision-making tool for health care policy makers, it took more than 20 months to obtain access to the required data due to legal and ethical constraints. In addition, a number of important data variables could not be made available, which restricted the utility of the platform under development. 
The platform could then have been installed in health care sites more quickly and connected to real data for validation and comparison of performance for synthetic versus real data, enabling performance tweaks to mitigate bias introduced by synthetic data, if any. Synthetic data could also enable cross-site analytics in various health regions that would enable policy makers to connect their health spaces and potentially provide significant enhancements to cross-national health policy.</p>
        <p>The ultimate goal of this work was to further assess the validity and disclosure risk of synthetic data under the stringent conditions associated with health care data with the view to successfully developing a pipeline for use in health care that enables synthetic datasets to be released publicly to researchers, who would otherwise not be able to access the data or access it in a timely fashion, in order to accelerate research by enabling the wider research community to use the data for analysis and model development. The results of such analyses and the models and code developed can then be given to health care departments for validation on the real data and, if effective, put into use by clinicians and health policy-makers.</p>
      </sec>
      <sec>
        <title>Synthetic Data Pipeline for Health Care</title>
        <p>To understand how health care departments can benefit from synthetic data, we propose the pipeline shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>. This is a proposed synthetic data-sharing pipeline provided as an illustration of how synthetic data can potentially work within a real health care setting to expedite data analytics. In future work, we plan to test this pipeline in a real setting. In this pipeline, real data reside within the national health care department infrastructure. The data cannot be shared externally due to the sensitive and private nature. Health care departments may only have a small number of data science staff with the expertise necessary to apply machine learning techniques to many of their datasets, so they cannot maximize the use of their data or discover the potential use of the data due to lack of resources. By applying a synthetic data generation technique to the real data along with SDC measures, a synthetic dataset can be produced and made available to the external research community in place of the real data. External researchers, in large numbers and with wide-ranging expertise, can potentially develop optimal machine learning models trained on the synthetic data and share the performance of the machine learning model, the model itself, and the model specification with the national health care department. The health care department can then test the machine learning model on real data, or in-house technical staff can rebuild the model according to the specification provided by researchers including the program code written by researchers, details of the machine learning algorithm to use (eg, decision tree [DT], support vector machine [SVM]), and the optimal hyperparameter settings determined during development. Using these settings, the model can be rebuilt, this time by training on the real data instead of synthetic data, to which in-house staff have access.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Proposed synthetic data sharing pipeline illustrates how synthetic data could be implemented to expedite health care data analytics.</p>
          </caption>
          <graphic xlink:href="medinform_v8i7e18910_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Dataset Selection</title>
        <p>For experimentation, 19 open health care datasets have been selected from the University of California Irvine Machine Learning Repository [<xref ref-type="bibr" rid="ref19">19</xref>]. Missing values have been removed from the datasets either by removing features with a high number of missing values or removing observations where a feature contains a missing value. The experimental datasets and their properties are summarized in <xref ref-type="table" rid="table1">Table 1</xref>. These datasets were selected to enable an analysis of synthetic data performance when applied to datasets of differing volume and data types (categorical and numerical).</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Summary of experimental datasets.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="260"/>
            <col width="0"/>
            <col width="100"/>
            <col width="0"/>
            <col width="200"/>
            <col width="0"/>
            <col width="170"/>
            <col width="0"/>
            <col width="120"/>
            <col width="0"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Dataset and letter designation<sup>a</sup></td>
                <td colspan="2">Attributes n</td>
                <td colspan="2">Categorical attributes n</td>
                <td colspan="2">Numerical attributes n</td>
                <td colspan="2">Classes/labels n</td>
                <td>Observations n</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>A</td>
                <td>Breast Cancer Wisconsin (original)</td>
                <td colspan="2">9</td>
                <td colspan="2">0</td>
                <td colspan="2">9</td>
                <td colspan="2">2</td>
                <td colspan="2">683</td>
              </tr>
              <tr valign="top">
                <td>B</td>
                <td>Breast Cancer</td>
                <td colspan="2">9</td>
                <td colspan="2">9</td>
                <td colspan="2">0</td>
                <td colspan="2">2</td>
                <td colspan="2">277</td>
              </tr>
              <tr valign="top">
                <td>C</td>
                <td>Breast Cancer Coimbra</td>
                <td colspan="2">9</td>
                <td colspan="2">0</td>
                <td colspan="2">9</td>
                <td colspan="2">2</td>
                <td colspan="2">116</td>
              </tr>
              <tr valign="top">
                <td>D</td>
                <td>Breast Tissue</td>
                <td colspan="2">9</td>
                <td colspan="2">0</td>
                <td colspan="2">9</td>
                <td colspan="2">6</td>
                <td colspan="2">106</td>
              </tr>
              <tr valign="top">
                <td>E</td>
                <td>Chronic Kidney Disease</td>
                <td colspan="2">21</td>
                <td colspan="2">12</td>
                <td colspan="2">9</td>
                <td colspan="2">2</td>
                <td colspan="2">209</td>
              </tr>
              <tr valign="top">
                <td>F</td>
                <td>Cardiotocography (3 class)</td>
                <td colspan="2">21</td>
                <td colspan="2">0</td>
                <td colspan="2">21</td>
                <td colspan="2">3</td>
                <td colspan="2">2126</td>
              </tr>
              <tr valign="top">
                <td>G</td>
                <td>Cardiotocography (10 class)</td>
                <td colspan="2">21</td>
                <td colspan="2">0</td>
                <td colspan="2">21</td>
                <td colspan="2">10</td>
                <td colspan="2">2126</td>
              </tr>
              <tr valign="top">
                <td>H</td>
                <td>Dermatology</td>
                <td colspan="2">34</td>
                <td colspan="2">33</td>
                <td colspan="2">1</td>
                <td colspan="2">6</td>
                <td colspan="2">358</td>
              </tr>
              <tr valign="top">
                <td>I</td>
                <td>Diabetic Retinopathy</td>
                <td colspan="2">19</td>
                <td colspan="2">3</td>
                <td colspan="2">16</td>
                <td colspan="2">2</td>
                <td colspan="2">1151</td>
              </tr>
              <tr valign="top">
                <td>J</td>
                <td>Echocardiogram</td>
                <td colspan="2">10</td>
                <td colspan="2">2</td>
                <td colspan="2">8</td>
                <td colspan="2">3</td>
                <td colspan="2">106</td>
              </tr>
              <tr valign="top">
                <td>K</td>
                <td>EEG<sup>b</sup> Eye State</td>
                <td colspan="2">14</td>
                <td colspan="2">0</td>
                <td colspan="2">14</td>
                <td colspan="2">2</td>
                <td colspan="2">14980</td>
              </tr>
              <tr valign="top">
                <td>L</td>
                <td>Heart Disease</td>
                <td colspan="2">13</td>
                <td colspan="2">8</td>
                <td colspan="2">5</td>
                <td colspan="2">2</td>
                <td colspan="2">303</td>
              </tr>
              <tr valign="top">
                <td>M</td>
                <td>Lymphography</td>
                <td colspan="2">18</td>
                <td colspan="2">18</td>
                <td colspan="2">0</td>
                <td colspan="2">4</td>
                <td colspan="2">148</td>
              </tr>
              <tr valign="top">
                <td>N</td>
                <td>Postoperative Patient Data</td>
                <td colspan="2">8</td>
                <td colspan="2">8</td>
                <td colspan="2">0</td>
                <td colspan="2">3</td>
                <td colspan="2">87</td>
              </tr>
              <tr valign="top">
                <td>O</td>
                <td>Primary Tumor</td>
                <td colspan="2">15</td>
                <td colspan="2">15</td>
                <td colspan="2">0</td>
                <td colspan="2">21</td>
                <td colspan="2">336</td>
              </tr>
              <tr valign="top">
                <td>P</td>
                <td>Stroke</td>
                <td colspan="2">10</td>
                <td colspan="2">7</td>
                <td colspan="2">3</td>
                <td colspan="2">2</td>
                <td colspan="2">29072</td>
              </tr>
              <tr valign="top">
                <td>Q</td>
                <td>Thoracic Surgery</td>
                <td colspan="2">16</td>
                <td colspan="2">13</td>
                <td colspan="2">3</td>
                <td colspan="2">2</td>
                <td colspan="2">470</td>
              </tr>
              <tr valign="top">
                <td>R</td>
                <td>Thyroid Disease</td>
                <td colspan="2">22</td>
                <td colspan="2">16</td>
                <td colspan="2">6</td>
                <td colspan="2">28</td>
                <td colspan="2">5786</td>
              </tr>
              <tr valign="top">
                <td>S</td>
                <td>Thyroid Disease (New)</td>
                <td colspan="2">5</td>
                <td colspan="2">0</td>
                <td colspan="2">5</td>
                <td colspan="2">3</td>
                <td colspan="2">215</td>
              </tr>
              <tr valign="top">
                <td>—</td>
                <td>Total</td>
                <td colspan="2">283</td>
                <td colspan="2">144</td>
                <td colspan="2">139</td>
                <td colspan="2">105</td>
                <td colspan="2">58,655</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>Each dataset has been encoded with a letter (column 1) and will be referenced using this letter for the remainder of the paper.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>EEG: electroencephalograph.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Generating Synthetic Data</title>
        <p>In this work, we analyzed and assessed the performance of three publicly available synthetic data generation techniques that are based on well-known, seminal work in the area [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]: a parametric data synthesis technique, a nonparametric tree-based synthesis technique that uses CART [<xref ref-type="bibr" rid="ref15">15</xref>], and a synthesis technique that uses Bayesian networks [<xref ref-type="bibr" rid="ref16">16</xref>]. While other approaches exist, some are developed for specific datasets and problems (eg, SimPop simulates population survey data [<xref ref-type="bibr" rid="ref44">44</xref>], and Synthea simulates patient population and electronic health record data [<xref ref-type="bibr" rid="ref45">45</xref>]), whereas these techniques are considered to be more general. The R package Synthpop, developed by Nowok et al [<xref ref-type="bibr" rid="ref17">17</xref>], provides a publicly available implementation of the parametric- and CART-based synthetic data generators. The DataSynthesizer Python implementation, developed by Ping et al [<xref ref-type="bibr" rid="ref16">16</xref>], provides a publicly available implementation of the Bayesian network-based synthetic data generator. These implementations have been used in this experimental work.</p>
        <p>Attributes were synthesized sequentially in both the parametric and CART methods. The synthetic values for the first attribute were synthesized using a random sample from the original observed data since it has no predictors from previously synthesized attributes in the dataset. When synthesizing attributes, both categorical and numerical, with the nonparametric method, the CART method was applied. CART was applied to all variables that had predictors (ie, attributes prior to them in the sequence) and drew from the conditional distributions fitted to the original data using CART models. The parametric method synthesizes attributes based on data type. Numerical attributes were synthesized using normal linear regression. Categorical attributes were synthesized using polytomous logistic regression where the attribute had more than two levels, and logistic regression was applied to synthesize binary categorical variables [<xref ref-type="bibr" rid="ref17">17</xref>]. The Bayesian network method of synthesizing data learned a differentially private Bayesian network that captured correlation structure between attributes in the real data and drew samples from this model to produce synthetic data [<xref ref-type="bibr" rid="ref16">16</xref>].</p>
      </sec>
      <sec>
        <title>Supervised Machine Learning With Real and Synthetic Data</title>
        <p>A key measure of data utility of a synthetic dataset for the purpose of machine learning is to determine how well a supervised machine learning model trained on synthetic data performs when tasked with classifying real data. This determines whether supervised machine learning models will be robust enough to classify real data examples if only synthetic data are provided for the training of these models.</p>
        <p>To evaluate whether synthetic datasets could be used as a valid alternative to real datasets in machine learning, for each of the 19 datasets (<xref ref-type="table" rid="table1">Table 1</xref>), five classification models were trained. Initially, the models were trained and tested on the real data to obtain a performance benchmark. Subsequently, a classifier was trained on each of the synthetic datasets, generated using parametric, CART and Bayesian network techniques, and then tested with the real data. Models were tested on real data only to determine whether a model developed by training on synthetic data can be put into use by health care departments and used to accurately classify new, real examples.</p>
        <p>The models applied to each dataset were stochastic gradient descent (SGD), DT, k-nearest neighbors (KNN), random forest (RF), and SVM. This selection of algorithms was applied to determine how well each performed when trained with the real data compared with the synthetic data, with both tested on real data.</p>
        <p>The classifiers were implemented using Python’s Scikit-Learn 0.21.3 machine learning library and are as follows:</p>
        <list list-type="bullet">
          <list-item>
            <p>Stochastic gradient descent classification was implemented using SGDClassifier, a simple linear classifier, with loss=“hinge,” random_state=0 and all other parameters set to their defaults</p>
          </list-item>
          <list-item>
            <p>DT classification was implemented using DecisionTreeClassifier, an optimized version of CART, with criterion=“gini,” max_depth=10, and random_state=0 and all other parameters set to their defaults</p>
          </list-item>
          <list-item>
            <p>K-nearest neighbors classification was implemented using KNeighborsClassifier with n_neighbors=10, weights=“uniform,” leaf_size=30, p=2, metric=“minkowski,” n_jobs=2 and all other parameters set to their defaults</p>
          </list-item>
          <list-item>
            <p>RF classification was implemented using RandomForestClassifier with criterion=“gini,” max_depth=10, min_samples_split=2, n_estimators=10, random_state=1 and all other parameters set to their defaults</p>
          </list-item>
          <list-item>
            <p>SVM classification was implemented using SVC with C=1.0, degree=3, kernel=“rbf,” probability=True, random_state=None and all other parameters set to their defaults</p>
          </list-item>
        </list>
        <p>For training and testing, Python’s Scikit-Learn 0.21.3 ShuffleSplit random permutation cross-validator was used with 10 splitting iterations and a train/test split of 75/25. Categorical attributes were transformed into indicator attributes using one-hot encoding.</p>
      </sec>
      <sec>
        <title>Statistical Disclosure Control</title>
        <p>Synthetic data are considered not to contain real units and therefore the risk of disclosure of a real person is considered to be unlikely [<xref ref-type="bibr" rid="ref46">46</xref>]. While unlikely, the scenario where some of the generated synthetic data are very similar to the real data resulting in potential disclosure risk must be considered, and where additional protections can be applied to synthetic data, it is recommended to do so. Additional SDC measures beyond data synthesis can be applied as a precautionary measure to add further protections to synthetic data by reducing the risk of reproducing real-person records and replicating outlier data, thus further minimizing the risk of disclosure. There are two broad categories of SDC; rules-based SDC consists of a set of fixed rules governing what data can or cannot be released (eg, a rule setting a specific minimum frequency threshold on a dataset in order for it to be released) and principles-based SDC consists of a broader assessment of risk for a dataset to determine whether it is safe for release (eg, in the case where a specific rule on thresholds may not be applicable because the data cannot be linked back to individuals or in cases where thresholds are not enough to protect individuals from reidentification [<xref ref-type="bibr" rid="ref47">47</xref>]). SDC measures can be applied, evaluated, and reparameterized as part of the penetration and reidentification testing that health care providers would apply before releasing a synthesized dataset.</p>
        <p>The following SDC methods, appropriate for rules-based SDC, have been considered and applied in experimental work to determine their effect on data utility:</p>
        <list list-type="bullet">
          <list-item>
            <p>Minimum leaf size (CART method specific): for the CART method, a minimum final leaf node size can be set to avoid the risk of final nodes containing small numbers of records, thus increasing the risk of producing real records (and thus real-person data) in the synthesized data. In SDC experiments, this is set to 10.</p>
          </list-item>
          <list-item>
            <p>Smoothing: smoothing can be applied to continuous/numerical fields in the synthesized data to reduce the risk of releasing unusual/outlier data. In SDC experiments, Gaussian kernel density smoothing is applied to numerical attributes only.</p>
          </list-item>
          <list-item>
            <p>Unique removal: unique records with variable sequences that are identical to records in the real dataset can be removed. In SDC experiments, this has been applied to synthetic data.</p>
          </list-item>
        </list>
        <p>Each of these SDC techniques has been applied to the datasets generated using the CART technique, and the smoothing and unique removal techniques have been applied to datasets generated using the parametric technique. SDC methods have not been applied to data synthesized using the Bayesian network technique.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Synthetic Data Properties</title>
        <sec>
          <title>Comparison of Variable Relationships</title>
          <p>Within a dataset, relationships can exist between variables. When data are synthesized, we wish to determine whether these relationships are preserved and where they are not preserved, whether this relates to the synthesis technique or structure of the dataset. An analysis of these linear relationships was performed by computing the normalized pairwise mutual information score between each pair of attributes. This is a measure of association or similarity where a higher score indicates a greater association between two attributes. <xref rid="figure2" ref-type="fig">Figure 2</xref> provides a visual representation of the normalized pairwise mutual information scores in adjacency heatmaps for each of the 19 datasets (listed in column 1) and enables visual determination of whether the associations found in the real datasets (column 2) are similar to the associations in the synthetic datasets (columns 3-5) for each of the three synthetic data generators.</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Pairwise mutual information for the real and synthetic datasets. These adjacency heat maps provide an efficient approach to visually determine whether the associations in the real datasets are similar to the associations in the corresponding synthetic datasets. Column 1 indicates the dataset, column 2 indicates the pairwise mutual information for the real data, and columns 3-5 indicate the pairwise mutual information for synthetic datasets generated using CART, parametric and Bayesian network approaches, respectively.</p>
            </caption>
            <graphic xlink:href="medinform_v8i7e18910_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <p>The relationships between variables changed slightly in synthetic data generated using the CART and parametric techniques for datasets C-G, I-K, and S, with decreased correlations observed between attribute pairs. These datasets contain mainly and in some cases only numerical attributes. The relationships were largely preserved for the other datasets, which contain mainly and in some cases only categorical attributes, with the exception of dataset A, which contains only numerical attributes.</p>
          <p>The relationships between variables also changed slightly in a number of datasets synthesized using the Bayesian network technique (eg, E-G, I-L, N, P-S), with increased correlations observed between attribute pairs. The relationships were largely preserved in datasets B-D, M, and O, while a slight decrease in correlations between attribute pairs was observed for datasets A and H. In this case, the changes cannot be attributed to a particular data type.</p>
        </sec>
      </sec>
      <sec>
        <title>Supervised Machine Learning With Real and Synthetic Data</title>
        <sec>
          <title>Performance Comparison</title>
          <p>To compare the performance of each model when trained on the synthetic data and tested with the real data, a variety of evaluation metrics were used. The accuracy, precision, recall, and F1 score were computed to determine performance.</p>
          <p>The accuracy scores for five machine learning models are shown in <xref ref-type="table" rid="table2">Table 2</xref> for datasets A through S. Accuracy scores for models trained on the real data and synthetic data are shown where synthetic data is generated using CART, parametric, and Bayesian network techniques, respectively. The accuracy of the models when trained on synthetic data is lower than the accuracy when trained on real data in 92% (263/285) of cases (ie, machine learning results are less accurate for synthetic data in 92% of cases; <xref ref-type="table" rid="table3">Table 3</xref>).</p>
          <p>Although the accuracy decreases in most cases when using synthetic models, this reduction in accuracy is small. The mean absolute difference in accuracy in models trained with synthetic data across all three synthesizing techniques is lowest for SVM, SGD, and KNN models at 0.058 (6%), 0.064 (6%), and 0.072 (7%), respectively. RF and DT models have larger deviations in accuracy at 0.177 (18%) and 0.193 (19%), respectively (<xref ref-type="table" rid="table4">Table 4</xref>). This pattern is also consistent when considering results for each of the three synthetic data generators separately. These results are illustrated in the boxplots in <xref rid="figure3" ref-type="fig">Figure 3</xref>. The mean absolute difference may provide a reliable indicator of the expected decrease in accuracy in supervised machine learning models when developed using synthetic data. A small yet consistent difference in accuracy is expected and manageable between real and synthetic data.</p>
          <p>In addition to accuracy scores, we consider changes to precision, recall, and F1 scores. Precision, recall, and F1 scores decrease in almost all models and for data generated with each synthetic data technique across all 19 datasets (<xref rid="figure4" ref-type="fig">Figure 4</xref>). These decreases indicate that the models generated with synthetic data have a higher rate of false-positive and false-negative predictions than models trained with real data. Decreases in precision, recall, and F1 are larger in DT and RF models, consistent with changes in accuracy scores; however, the changes are larger than changes in accuracy for these models. The variance in precision, recall, and F1 differences is also more notable in models trained with synthetic data generated using the Bayesian network approach with less problematic decreases observed in models trained with synthetic data generated using the CART and parametric approaches.</p>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>Comparison of accuracy scores of five supervised machine learning models trained on real data and synthetic data across 19 datasets. Increase or decrease in accuracy compared with the model trained on real data shown in parentheses.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="30"/>
              <col width="150"/>
              <col width="180"/>
              <col width="160"/>
              <col width="150"/>
              <col width="160"/>
              <col width="170"/>
              <thead>
                <tr valign="top">
                  <td colspan="2">Dataset and training set<sup>a</sup></td>
                  <td colspan="5">Machine learning algorithm accuracy</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <break/>
                  </td>
                  <td>SGD<sup>b</sup></td>
                  <td>DT<sup>c</sup></td>
                  <td>KNN<sup>d</sup></td>
                  <td>RF<sup>e</sup></td>
                  <td>SVM<sup>f</sup></td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="2">
                    <bold>A</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.962</td>
                  <td>1.000 (W<sup>g</sup>)</td>
                  <td>0.975</td>
                  <td>0.997</td>
                  <td>0.974</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART<sup>h</sup></td>
                  <td>0.966 (+0.004)</td>
                  <td>0.950 (–0.050)</td>
                  <td>0.967 (–0.008)</td>
                  <td>0.965 (–0.032)</td>
                  <td>0.969 (W) (–0.005)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.932 (–0.030)</td>
                  <td>0.907 (–0.093)</td>
                  <td>0.931 (–0.044)</td>
                  <td>0.927 (–0.070)</td>
                  <td>0.946 (W) (–0.028)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.954 (–0.011)</td>
                  <td>0.924 (–0.076)</td>
                  <td>0.963 (–0.012)</td>
                  <td>0.947 (–0.050)</td>
                  <td>0.967 (W) (–0.007)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>B</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.668</td>
                  <td>0.931 (W)</td>
                  <td>0.758</td>
                  <td>0.924</td>
                  <td>0.83</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.652 (–0.016)</td>
                  <td>0.698 (–0.233)</td>
                  <td>0.765 (+0.007)</td>
                  <td>0.749 (–0.175)</td>
                  <td>0.784 (W) (–0.046)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.706 (+0.048)</td>
                  <td>0.700 (–0.231)</td>
                  <td>0.748 (–0.010)</td>
                  <td>0.726 (–0.198)</td>
                  <td>0.753 (W) (–0.077)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.674 (+0.006)</td>
                  <td>0.712 (–0.219)</td>
                  <td>0.744 (–0.014)</td>
                  <td>0.741 (–0.183)</td>
                  <td>0.770 (W) (–0.060)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>C</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.629</td>
                  <td>1.000 (W)</td>
                  <td>0.784</td>
                  <td>0.983</td>
                  <td>0.905</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.603 (–0.026)</td>
                  <td>0.652 (–0.348)</td>
                  <td>0.662 (–0.122)</td>
                  <td>0.676 (–0.307)</td>
                  <td>0.729 (W) (–0.176)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.707 (+0.078)</td>
                  <td>0.702 (–0.298)</td>
                  <td>0.652 (–0.132)</td>
                  <td>0.709 (W) (–0.272)</td>
                  <td>0.700 (–0.205)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.662 (+0.033)</td>
                  <td>0.709 (–0.291)</td>
                  <td>0.664 (–0.144)</td>
                  <td>0.747 (W) (–0.236)</td>
                  <td>0.710 (–0.195)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>D</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.632</td>
                  <td>1.000 (W)</td>
                  <td>0.726</td>
                  <td>0.962</td>
                  <td>0.66</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.502 (–0.130)</td>
                  <td>0.664 (–0.336)</td>
                  <td>0.542 (–0.184)</td>
                  <td>0.706 (W) (–0.254)</td>
                  <td>0.536 (–0.124)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.472 (–0.160)</td>
                  <td>0.666 (W) (–0.334)</td>
                  <td>0.508 (–0.218)</td>
                  <td>0.628 (–0.334)</td>
                  <td>0.545 (–0.115)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.438 (–0.194)</td>
                  <td>0.592 (–0.408)</td>
                  <td>0.511 (–0.215)</td>
                  <td>0.649 (W) (–0.313)</td>
                  <td>0.557 (–0.103)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>E</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.995</td>
                  <td>1.000 (W)</td>
                  <td>0.981</td>
                  <td>1.000 (W)</td>
                  <td>0.995</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.972 (–0.023)</td>
                  <td>0.944 (–0.056)</td>
                  <td>0.967 (–0.014)</td>
                  <td>0.995 (W) (–0.005)</td>
                  <td>0.994 (–0.001)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.964 (–0.031)</td>
                  <td>0.981 (–0.019)</td>
                  <td>0.965 (–0.016)</td>
                  <td>0.988 (W) (–0.012)</td>
                  <td>0.988 (W) (–0.007)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.986 (–0.009)</td>
                  <td>0.957 (–0.043)</td>
                  <td>0.974 (–0.007)</td>
                  <td>0.992 (–0.008)</td>
                  <td>0.993 (W) (–0.002)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>F</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.89</td>
                  <td>0.985 (W)</td>
                  <td>0.912</td>
                  <td>0.982</td>
                  <td>0.913</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.869 (–0.021)</td>
                  <td>0.922 (W) (–0.063)</td>
                  <td>0.883 (–0.029)</td>
                  <td>0.921 (–0.061)</td>
                  <td>0.889 (–0.024)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.873 (–0.017)</td>
                  <td>0.907 (–0.078)</td>
                  <td>0.886 (–0.026)</td>
                  <td>0.914 (W) (–0.068)</td>
                  <td>0.894 (–0.019)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.880 (–0.010)</td>
                  <td>0.918 (–0.067)</td>
                  <td>0.885 (–0.027)</td>
                  <td>0.924 (W) (–0.058)</td>
                  <td>0.893 (–0.020)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>G</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.746</td>
                  <td>0.959</td>
                  <td>0.78</td>
                  <td>0.971 (W)</td>
                  <td>0.82</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.667 (–0.079)</td>
                  <td>0.848 (W) (–0.111)</td>
                  <td>0.678 (–0.102)</td>
                  <td>0.841 (–0.070)</td>
                  <td>0.748 (–0.072)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.669 (–0.077)</td>
                  <td>0.805 (W) (–0.154)</td>
                  <td>0.676 (–0.104)</td>
                  <td>0.801 (–0.107)</td>
                  <td>0.737 (–0.083)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.676 (–0.070)</td>
                  <td>0.835 (W) (–0.124)</td>
                  <td>0.676 (–0.104)</td>
                  <td>0.822 (–0.149)</td>
                  <td>0.739 (–0.081)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>H</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>1.000 (W)</td>
                  <td>0.997</td>
                  <td>0.98</td>
                  <td>0.994</td>
                  <td>0.992</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.940 (–0.060)</td>
                  <td>0.941 (–0.056)</td>
                  <td>0.891 (–0.089)</td>
                  <td>0.958 (W) (–0.036)</td>
                  <td>0.955 (–0.037)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.935 (–0.065)</td>
                  <td>0.951 (–0.046)</td>
                  <td>0.898 (–0.082)</td>
                  <td>0.959 (W) (–0.135)</td>
                  <td>0.959 (W) (–0.032)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.940 (–0.060)</td>
                  <td>0.952 (–0.045)</td>
                  <td>0.899 (–0.081)</td>
                  <td>0.955 (–0.139)</td>
                  <td>0.959 (W) (–0.032)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>I</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.706</td>
                  <td>0.845</td>
                  <td>0.711</td>
                  <td>0.896 (W)</td>
                  <td>0.676</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.594 (–0.112)</td>
                  <td>0.643 (–0.202)</td>
                  <td>0.634 (–0.077)</td>
                  <td>0.671 (W) (–0.225)</td>
                  <td>0.609 (–0.067)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.570 (–0.136)</td>
                  <td>0.638 (–0.207)</td>
                  <td>0.624 (–0.087)</td>
                  <td>0.663 (W) (–0.233)</td>
                  <td>0.608 (–0.068)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.609 (–0.097)</td>
                  <td>0.648 (–0.197)</td>
                  <td>0.629 (–0.082)</td>
                  <td>0.667 (W) (–0.229)</td>
                  <td>0.622 (–0.054)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>J</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.453</td>
                  <td>0.981 (W)</td>
                  <td>0.642</td>
                  <td>0.981 (W)</td>
                  <td>0.651</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.526 (+0.073)</td>
                  <td>0.655 (W) (–0.326)</td>
                  <td>0.579 (–0.063)</td>
                  <td>0.649 (–0.332)</td>
                  <td>0.551 (–0.100)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.555 (+0.102)</td>
                  <td>0.689 (W) (–0.292)</td>
                  <td>0.606 (–0.036)</td>
                  <td>0.628 (–0.354)</td>
                  <td>0.549 (–0.102)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.545 (+0.092)</td>
                  <td>0.585 (–0.396)</td>
                  <td>0.585 (–0.057)</td>
                  <td>0.602 (W) (–0.379)</td>
                  <td>0.551 (–0.100)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>K</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.551</td>
                  <td>0.845</td>
                  <td>0.864</td>
                  <td>0.885 (W)</td>
                  <td>0.551</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.510 (–0.041)</td>
                  <td>0.531 (W) (–0.314)</td>
                  <td>0.531 (W) (–0.333)</td>
                  <td>0.512 (–0.373)</td>
                  <td>0.531 (W) (–0.020)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.514 (–0.037)</td>
                  <td>0.545 (W) (–0.300)</td>
                  <td>0.510 (–0.354)</td>
                  <td>0.519 (–0.366)</td>
                  <td>0.531 (–0.020)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.490 (–0.061)</td>
                  <td>0.538 (W) (–0.307)</td>
                  <td>0.510 (–0.354)</td>
                  <td>0.531 (–0.354)</td>
                  <td>0.510 (–0.041)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>L</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.851</td>
                  <td>1.000 (W)</td>
                  <td>0.861</td>
                  <td>0.977</td>
                  <td>0.865</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.791 (–0.060)</td>
                  <td>0.781 (–0.219)</td>
                  <td>0.758 (–0.103)</td>
                  <td>0.803 (W) (–0.174)</td>
                  <td>0.785 (–0.080)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.822 (W) (–0.029)</td>
                  <td>0.758 (–0.242)</td>
                  <td>0.786 (–0.075)</td>
                  <td>0.809 (–0.168)</td>
                  <td>0.793 (–0.072)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.785 (–0.066)</td>
                  <td>0.738 (–0.262)</td>
                  <td>0.818 (–0.043)</td>
                  <td>0.799 (–0.178)</td>
                  <td>0.834 (W) (–0.031)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>M</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.899</td>
                  <td>1.000 (W)</td>
                  <td>0.838</td>
                  <td>0.986</td>
                  <td>0.939</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.726 (–0.173)</td>
                  <td>0.762 (–0.238)</td>
                  <td>0.762 (–0.076)</td>
                  <td>0.780 (–0.206)</td>
                  <td>0.782 (W) (–0.157)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.739 (–0.160)</td>
                  <td>0.765 (–0.235)</td>
                  <td>0.757 (–0.081)</td>
                  <td>0.772 (–0.214)</td>
                  <td>0.796 (W) (–0.143)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.681 (–0.218)</td>
                  <td>0.662 (–0.338)</td>
                  <td>0.703 (–0.135)</td>
                  <td>0.746 (–0.240)</td>
                  <td>0.780 (W) (–0.159)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>N</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.713</td>
                  <td>0.908 (W)</td>
                  <td>0.713</td>
                  <td>0.908 (W)</td>
                  <td>0.713</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.706 (–0.007)</td>
                  <td>0.667 (–0.241)</td>
                  <td>0.715 (+0.002)</td>
                  <td>0.680 (–0.228)</td>
                  <td>0.720(W) (+0.007)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.644 (–0.067)</td>
                  <td>0.614 (–0.294)</td>
                  <td>0.706 (–0.007)</td>
                  <td>0.646 (–0.262)</td>
                  <td>0.708 (W) (–0.005)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.559 (–0.154)</td>
                  <td>0.591 (–0.317)</td>
                  <td>0.706 (W) (–0.007)</td>
                  <td>0.630 (–0.278)</td>
                  <td>0.694 (–0.019)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>O</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.449</td>
                  <td>0.732</td>
                  <td>0.458</td>
                  <td>0.762 (W)</td>
                  <td>0.56</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.338 (–0.111)</td>
                  <td>0.401 (–0.331)</td>
                  <td>0.411 (–0.047)</td>
                  <td>0.410 (–0.352)</td>
                  <td>0.425 (W) (–0.135)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.317 (–0.192)</td>
                  <td>0.377 (–0.355)</td>
                  <td>0.413 (–0.045)</td>
                  <td>0.397 (–0.365)</td>
                  <td>0.433 (W) (–0.127)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.293 (–0.156)</td>
                  <td>0.336 (–0.396)</td>
                  <td>0.375 (–0.083)</td>
                  <td>0.361 (–0.401)</td>
                  <td>0.419 (W) (–0.141)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>P</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.981</td>
                  <td>0.985 (W)</td>
                  <td>0.981</td>
                  <td>0.982</td>
                  <td>0.981</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.981 (W) (0.000)</td>
                  <td>0.977 (–0.008)</td>
                  <td>0.981 (W) (0.000)</td>
                  <td>0.981 (W) (–0.001)</td>
                  <td>0.981 (W) (0.000)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.981 (W) (0.000)</td>
                  <td>0.976 (–0.009)</td>
                  <td>0.981 (W) (0.000)</td>
                  <td>0.981 (W) (–0.001)</td>
                  <td>0.981 (W) (0.000)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.981 (W) (0.000)</td>
                  <td>0.977 (–0.008)</td>
                  <td>0.981 (W) (0.000)</td>
                  <td>0.981 (W) (–0.001)</td>
                  <td>0.981 (W) (0.000)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>Q</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.84</td>
                  <td>0.932 (W)</td>
                  <td>0.853</td>
                  <td>0.928</td>
                  <td>0.851</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.834 (–0.006)</td>
                  <td>0.795 (–0.137)</td>
                  <td>0.850 (–0.003)</td>
                  <td>0.835 (–0.093)</td>
                  <td>0.851 (W) (0.000)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.798 (–0.042)</td>
                  <td>0.811 (–0.121)</td>
                  <td>0.848 (–0.005)</td>
                  <td>0.838 (–0.090)</td>
                  <td>0.849 (W) (–0.002)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.823 (–0.017)</td>
                  <td>0.794 (–0.138)</td>
                  <td>0.846 (–0.007)</td>
                  <td>0.837 (–0.091)</td>
                  <td>0.851 (W) (0.000)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>R</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.755</td>
                  <td>0.989 (W)</td>
                  <td>0.795</td>
                  <td>0.961</td>
                  <td>0.738</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.742 (–0.013)</td>
                  <td>0.819 (–0.170)</td>
                  <td>0.761 (–0.034)</td>
                  <td>0.825 (W) (–0.136)</td>
                  <td>0.733 (–0.005)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.749 (–0.006)</td>
                  <td>0.786 (–0.203)</td>
                  <td>0.764 (–0.031)</td>
                  <td>0.798 (W) (–0.163)</td>
                  <td>0.734 (–0.004)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.748 (–0.007)</td>
                  <td>0.835 (W) (–0.154)</td>
                  <td>0.762 (–0.033)</td>
                  <td>0.832 (–0.129)</td>
                  <td>0.734 (–0.004)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>S</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Real</td>
                  <td>0.958</td>
                  <td>1.000 (W)</td>
                  <td>0.921</td>
                  <td>1.000 (W)</td>
                  <td>0.953</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CART</td>
                  <td>0.903 (–0.055)</td>
                  <td>0.901 (–0.099)</td>
                  <td>0.899 (–0.022)</td>
                  <td>0.935 (W) (–0.065)</td>
                  <td>0.913 (–0.040)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Parametric</td>
                  <td>0.890 (–0.068)</td>
                  <td>0.913 (–0.087)</td>
                  <td>0.912 (–0.009)</td>
                  <td>0.930 (W) (–0.060)</td>
                  <td>0.926 (–0.027)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bayesian</td>
                  <td>0.905 (–0.053)</td>
                  <td>0.914 (–0.086)</td>
                  <td>0.908 (–0.013)</td>
                  <td>0.936 (W) (–0.064)</td>
                  <td>0.930 (–0.023)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table2fn1">
                <p><sup>a</sup>Training dataset name indicates if real or synthetic data were used to train the model and for synthetic datasets which synthetic data generator was used (ie, CART, parametric, or Bayesian).</p>
              </fn>
              <fn id="table2fn2">
                <p><sup>b</sup>SGD: stochastic gradient descent.</p>
              </fn>
              <fn id="table2fn3">
                <p><sup>c</sup>DT: decision tree.</p>
              </fn>
              <fn id="table2fn4">
                <p><sup>d</sup>KNN: k-nearest neighbors.</p>
              </fn>
              <fn id="table2fn5">
                <p><sup>e</sup>RF: random forest.</p>
              </fn>
              <fn id="table2fn6">
                <p><sup>f</sup>SVM: support vector machine.</p>
              </fn>
              <fn id="table2fn7">
                <p><sup>g</sup>(W) highlights the winning classifier for each training set.</p>
              </fn>
              <fn id="table2fn8">
                <p><sup>h</sup>CART: classification and regression trees.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <table-wrap position="float" id="table3">
            <label>Table 3</label>
            <caption>
              <p>Changes in accuracy for each machine learning model and synthetic data type (19 datasets and 3 synthetic data generators considered providing 57 synthetic datasets to analyze).</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="150"/>
              <col width="150"/>
              <col width="130"/>
              <col width="150"/>
              <col width="140"/>
              <col width="140"/>
              <col width="0"/>
              <col width="140"/>
              <thead>
                <tr valign="top">
                  <td>Change in accuracy</td>
                  <td colspan="5">Machine learning algorithm</td>
                  <td colspan="2">
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SGD<sup>a</sup> (n=57), n (%)</td>
                  <td>DT<sup>b</sup> (n=57), n (%)</td>
                  <td>KNN<sup>c</sup> (n=57), n (%)</td>
                  <td>RF<sup>d</sup> (n=57), n (%)</td>
                  <td colspan="2">SVM<sup>e</sup> (n=57), n (%)</td>
                  <td>Total (n=285), n (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Increase</td>
                  <td>8 (14)</td>
                  <td>0 (0)</td>
                  <td>2 (4)</td>
                  <td>0 (0)</td>
                  <td colspan="2">1 (2)</td>
                  <td>11 (4)</td>
                </tr>
                <tr valign="top">
                  <td>Same</td>
                  <td>3 (5)</td>
                  <td>0 (0)</td>
                  <td>3 (5)</td>
                  <td>0 (0)</td>
                  <td colspan="2">5 (9)</td>
                  <td>11 (4)</td>
                </tr>
                <tr valign="top">
                  <td>Decrease</td>
                  <td>46 (81)</td>
                  <td>57 (100)</td>
                  <td>52 (91)</td>
                  <td>57 (100)</td>
                  <td colspan="2">51 (89)</td>
                  <td>263 (92)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table3fn1">
                <p><sup>a</sup>SGD: stochastic gradient descent.</p>
              </fn>
              <fn id="table3fn2">
                <p><sup>b</sup>DT: decision tree.</p>
              </fn>
              <fn id="table3fn3">
                <p><sup>c</sup>KNN: k-nearest neighbors.</p>
              </fn>
              <fn id="table3fn4">
                <p><sup>d</sup>RF: random forest.</p>
              </fn>
              <fn id="table3fn5">
                <p><sup>e</sup>SVM: support vector machine.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <table-wrap position="float" id="table4">
            <label>Table 4</label>
            <caption>
              <p>Mean absolute difference in accuracy for each machine learning model and synthetic data type.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="220"/>
              <col width="140"/>
              <col width="170"/>
              <col width="150"/>
              <col width="170"/>
              <col width="150"/>
              <thead>
                <tr valign="top">
                  <td>Synthetic dataset</td>
                  <td colspan="5">Mean absolute difference in accuracy per machine learning algorithm</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SGD<sup>a</sup>, mean (%)</td>
                  <td>DT<sup>b</sup>, mean (%)</td>
                  <td>KNN<sup>c</sup>, mean (%)</td>
                  <td>RF<sup>d</sup>, mean (%)</td>
                  <td>SVM<sup>e</sup>, mean (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>CART<sup>f</sup></td>
                  <td>0.053 (5.3)</td>
                  <td>0.186 (18.6)</td>
                  <td>0.069 (6.9)</td>
                  <td>0.164 (16.4)</td>
                  <td>0.058 (5.8)</td>
                </tr>
                <tr valign="top">
                  <td>Parametric</td>
                  <td>0.071 (7.1)</td>
                  <td>0.189 (18.9)</td>
                  <td>0.072 (7.2)</td>
                  <td>0.183 (18.3)</td>
                  <td>0.060 (6.0)</td>
                </tr>
                <tr valign="top">
                  <td>Bayesian network</td>
                  <td>0.069 (6.9)</td>
                  <td>0.204 (20.4)</td>
                  <td>0.075 (7.5)</td>
                  <td>0.183 (18.3)</td>
                  <td>0.056 (5.6)</td>
                </tr>
                <tr valign="top">
                  <td>ALL</td>
                  <td>0.064 (6.4)</td>
                  <td>0.193 (19.3)</td>
                  <td>0.072 (7.2)</td>
                  <td>0.177 (17.7)</td>
                  <td>0.058 (5.8)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table4fn1">
                <p><sup>a</sup>SGD: stochastic gradient descent.</p>
              </fn>
              <fn id="table4fn2">
                <p><sup>b</sup>DT: decision tree.</p>
              </fn>
              <fn id="table4fn3">
                <p><sup>c</sup>KNN: k-nearest neighbors.</p>
              </fn>
              <fn id="table4fn4">
                <p><sup>d</sup>RF: random forest.</p>
              </fn>
              <fn id="table4fn5">
                <p><sup>e</sup>SVM: support vector machine.</p>
              </fn>
              <fn id="table4fn6">
                <p><sup>f</sup>CART: classification and regression trees.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Overall change in accuracy for each machine learning model when trained on synthetic data across 19 datasets and 3 synthetic data approaches where classification and regression tree (a), parametric (b), Bayesian network (c), and all approaches combined (d), compared with models trained using real data.</p>
            </caption>
            <graphic xlink:href="medinform_v8i7e18910_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <fig id="figure4" position="float">
            <label>Figure 4</label>
            <caption>
              <p>Overall change in precision (a-c), recall (d-f), and F1 (g-i) scores for each machine learning model when trained on synthetic data (from 19 datasets) generated using classification and regression tree (a, d, g), parametric (b, e, h) and Bayesian network (c, f, i) approaches, compared with models trained using real data.</p>
            </caption>
            <graphic xlink:href="medinform_v8i7e18910_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Winning Classifier</title>
          <p>In the pipeline described previously, health care departments may wish to release synthetic versions of data to the wider research community for the development of an optimal machine learning model—for example, they may wish to determine the best classifier to use on their real data by making use of the wider range of expertise and scale the external research community can provide. The researchers would be expected to train and test various models and hyperparameters to find the best solution. The researchers would then return a model and/or model specification to the health departments, enabling them to test the model on real data and/or enabling a health department’s technical staff to recreate a version of the model, this time trained on the real data to which in-house staff have access. Health departments would have the expectation that this would be the same model determined if real data had been used to develop the best model (ie, it would have been the “winning” model when trained on either synthetic or real data).</p>
          <p>We compared the winning classifier when trained and tested on real data with the winning classifier when trained on synthetic data and tested on real data. <xref ref-type="table" rid="table2">Table 2</xref> lists the winning classifier (marked as W on each row) for each dataset when trained with real and synthetic data and when tested on the real data.</p>
          <p>The winning classifier when trained on real data matches the winning classifier when trained on synthetic data in only 26% (5/19) of cases for synthetic data generated using the CART and parametric methods, and in just 21% (4/19) of cases on data synthesized using the Bayesian network technique (<xref ref-type="table" rid="table5">Table 5</xref>).</p>
          <table-wrap position="float" id="table5">
            <label>Table 5</label>
            <caption>
              <p>Number of instances where the winning classifier trained on synthetic data matches the winning classifier trained on real data across 19 datasets.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="180"/>
              <col width="270"/>
              <col width="220"/>
              <col width="330"/>
              <thead>
                <tr valign="top">
                  <td>Synthetic dataset</td>
                  <td colspan="3">Winning classifier matches for real versus synthetic</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>5 classifiers</td>
                  <td>4 classifiers (DT<sup>a</sup> removed)</td>
                  <td>3 classifiers (DT and RF<sup>b</sup> removed)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>CART<sup>c</sup></td>
                  <td>5/19 (26.3)</td>
                  <td>10/19 (52.6)</td>
                  <td>14/19 (73.7)</td>
                </tr>
                <tr valign="top">
                  <td>Parametric</td>
                  <td>5/19 (26.3)</td>
                  <td>10/19 (52.6)</td>
                  <td>10/19 (52.6)</td>
                </tr>
                <tr valign="top">
                  <td>Bayesian network</td>
                  <td>4/19 (21.1)</td>
                  <td>10/19 (52.6)</td>
                  <td>13/19 (68.4)</td>
                </tr>
                <tr valign="top">
                  <td>All</td>
                  <td>14/57</td>
                  <td>30/57</td>
                  <td>37/57</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table5fn1">
                <p><sup>a</sup>DT: decision tree.</p>
              </fn>
              <fn id="table5fn2">
                <p><sup>b</sup>RF: random forest.</p>
              </fn>
              <fn id="table5fn3">
                <p><sup>c</sup>CART: classification and regression trees.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <p>The DT classifier is most often the winning classifier, in 14/19 datasets, when real data are used to train and test the model, but DT is not the best classifier on synthetic data, winning in only 11/57 cases (<xref ref-type="table" rid="table2">Table 2</xref>). Tree-based methods (DT and RF) are the winning classifier on real data in 18/19 cases (95%). If we remove DTs from this analysis, the cases where the winning classifier when trained on synthetic data matches the winning classifier when trained on real data almost doubles, increasing to 53% (10/19) of cases for synthetic data generated using each of the three synthesizing techniques (<xref ref-type="table" rid="table5">Table 5</xref>).</p>
          <p>With DTs removed, RF models are now the most frequent winners (18/19) when real data are used to train and test the model. In this case, RF models produce the winning classifier in 32/57 cases (<xref ref-type="table" rid="table2">Table 2</xref>). If we further remove RFs from this analysis and do not consider tree-based classifiers, cases where the winning classifier when trained on synthetic data matches the winning classifier when trained on real data increases from 53% to 74% (14/19) and 68% (13/19) for data synthesized using CART and Bayesian network techniques, respectively, and remains unchanged for data generated using the parametric technique (<xref ref-type="table" rid="table5">Table 5</xref>).</p>
          <p>A chi-square test is applied with the following null and alternative hypotheses:</p>
          <list list-type="bullet">
            <list-item>
              <p>H0: the number of winning classifier matches is equal across all sets of classifiers.</p>
            </list-item>
            <list-item>
              <p>H1: the number of winning classifier matches increases when DT and RF classifiers are removed.</p>
            </list-item>
          </list>
          <p>The level of significance adopted for hypothesis testing is .05 for all tests performed.</p>
          <p>The null hypothesis is rejected when the tree-based models (DTs and RFs) are removed (ie, from 5 to 3 classifiers) for data synthesized using the CART and Bayesian network methods (<xref ref-type="table" rid="table6">Table 6</xref>). Therefore, a significant difference in the matching winning classifiers is observed when tree-based classifiers are removed for these two synthesizing techniques. The null hypothesis could not be rejected in all other cases.</p>
          <table-wrap position="float" id="table6">
            <label>Table 6</label>
            <caption>
              <p>Results of chi-square analysis of the difference in matching winning classifiers for models trained on real versus synthetic data.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="280"/>
              <col width="170"/>
              <col width="300"/>
              <col width="250"/>
              <thead>
                <tr valign="top">
                  <td>Synthetic dataset</td>
                  <td colspan="3">Winning classifier matches for real versus synthetic</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>5 classifiers</td>
                  <td>4 classifiers (DT<sup>a</sup> removed)</td>
                  <td>3 classifiers (DT and RF<sup>b</sup> removed)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>CART<sup>c</sup></td>
                  <td>0.1843</td>
                  <td>0.3130</td>
                  <td>0.0094</td>
                </tr>
                <tr valign="top">
                  <td>Parametric</td>
                  <td>0.1843</td>
                  <td>1.0000</td>
                  <td>0.1843</td>
                </tr>
                <tr valign="top">
                  <td>Bayesian network</td>
                  <td>0.0927</td>
                  <td>0.0927</td>
                  <td>0.0091</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table6fn1">
                <p><sup>a</sup>DT: decision tree.</p>
              </fn>
              <fn id="table6fn2">
                <p><sup>b</sup>RF: random forest.</p>
              </fn>
              <fn id="table6fn3">
                <p><sup>c</sup>CART: classification and regression trees.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
        <sec>
          <title>Impact of Statistical Disclosure Control</title>
          <p>The impact of SDC methods on data utility is considered across all datasets. <xref ref-type="table" rid="table7">Table 7</xref> illustrates the effect on model accuracy of applying smoothing (numeric attributes only), removal of unique records, and limiting the minimum leaf size (CART models only) to all synthetic datasets where each method is applicable.</p>
          <table-wrap position="float" id="table7">
            <label>Table 7</label>
            <caption>
              <p>Changes in accuracy for each machine learning model and with statistical disclosure control applied.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="30"/>
              <col width="170"/>
              <col width="140"/>
              <col width="140"/>
              <col width="110"/>
              <col width="120"/>
              <col width="130"/>
              <col width="160"/>
              <thead>
                <tr valign="top">
                  <td colspan="2">Change in accuracy</td>
                  <td colspan="6">Machine learning algorithm change in accuracy with SDC<sup>a</sup></td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <break/>
                  </td>
                  <td>SGD<sup>b</sup></td>
                  <td>DT<sup>c</sup></td>
                  <td>KNN<sup>d</sup></td>
                  <td>RF<sup>e</sup></td>
                  <td>SVM<sup>f</sup></td>
                  <td>Total</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="2">
                    <bold>Smoothing (n=150)</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Increase</td>
                  <td>4/30 (13.3)</td>
                  <td>0/30 (0.0)</td>
                  <td>1/30 (3.3)</td>
                  <td>0/30 (0.0)</td>
                  <td>2/30 (6.7)</td>
                  <td>7/150 (4.7)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Same</td>
                  <td>2/30 (6.7)</td>
                  <td>0/30 (0.0)</td>
                  <td>2/30 (6.7)</td>
                  <td>0/30 (0.0)</td>
                  <td>3/30 (10.0)</td>
                  <td>7/150 (4.7)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Decrease</td>
                  <td>24/30 (80.0)</td>
                  <td>30/30 (100.0)</td>
                  <td>27/30 (90.0)</td>
                  <td>30/30 (100.0)</td>
                  <td>25/30 (83.3)</td>
                  <td>136/150 (90.7)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>Unique removal (n=190)</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Increase</td>
                  <td>4/38 (10.5)</td>
                  <td>0/38 (0.0)</td>
                  <td>1/38 (2.6)</td>
                  <td>0/38 (0.0)</td>
                  <td>2/38 (5.3)</td>
                  <td>7/190 (3.7)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Same</td>
                  <td>2/38 (5.3)</td>
                  <td>0/38 (0.0)</td>
                  <td>2/38 (5.3)</td>
                  <td>0/38 (0.0)</td>
                  <td>4/38 (10.5)</td>
                  <td>8/190 (4.2)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Decrease</td>
                  <td>32/38 (84.2)</td>
                  <td>38/38 (100.0)</td>
                  <td>35/38 (92.1)</td>
                  <td>38/38 (100.0)</td>
                  <td>32/38 (84.2)</td>
                  <td>175/190 (92.1)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>Minimum leaf size (n=95)</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Increase</td>
                  <td>2/19 (10.5)</td>
                  <td>0/19 (0.0)</td>
                  <td>0/19 (0.0)</td>
                  <td>0/19 (0.0)</td>
                  <td>1/19 (5.3)</td>
                  <td>3/95 (3.2)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Same</td>
                  <td>1/19 (5.3)</td>
                  <td>0/19 (0.0)</td>
                  <td>1/19 (5.3)</td>
                  <td>0/19 (0.0)</td>
                  <td>2/19 (10.5)</td>
                  <td>4/95 (4.2)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Decrease</td>
                  <td>16/19 (84.2)</td>
                  <td>19/19 (100.0)</td>
                  <td>18/19 (94.7)</td>
                  <td>19/19 (100.0)</td>
                  <td>16/19 (84.2)</td>
                  <td>88/95 (92.6)</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>All (n=435)</bold>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Increase</td>
                  <td>10/87 (11.5)</td>
                  <td>0/87 (0.0)</td>
                  <td>2/87 (2.3)</td>
                  <td>0/87 (0.0)</td>
                  <td>5/87 (5.7)</td>
                  <td>17/435 (3.9)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Same</td>
                  <td>5/87 (5.7)</td>
                  <td>0/87 (0.0)</td>
                  <td>5/87 (5.7)</td>
                  <td>0/87 (0.0)</td>
                  <td>9/87 (10.3)</td>
                  <td>19/435 (4.4)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Decrease</td>
                  <td>72/87 (82.8)</td>
                  <td>87/87 (100.0)</td>
                  <td>80/87 (92.0)</td>
                  <td>87/87 (100.0)</td>
                  <td>73/87 (83.9)</td>
                  <td>399/435 (91.7)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table7fn1">
                <p><sup>a</sup>SDC: statistical disclosure control. Each of the 3 types of SDC applied (smoothing, unique removal and minimum leaf size for CART). SDC applied to parametric and CART methods only. Smoothing applied to datasets with numeric attributes only. Minimum leaf size for CART is applicable to CART only.</p>
              </fn>
              <fn id="table7fn2">
                <p><sup>b</sup>SGD: stochastic gradient descent.</p>
              </fn>
              <fn id="table7fn3">
                <p><sup>c</sup>DT: decision tree.</p>
              </fn>
              <fn id="table7fn4">
                <p><sup>d</sup>KNN: k-nearest neighbors.</p>
              </fn>
              <fn id="table7fn5">
                <p><sup>e</sup>RF: random forest.</p>
              </fn>
              <fn id="table7fn6">
                <p><sup>f</sup>SVM: support vector machine.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <p>In most cases, the machine learning model accuracy decreases when SDC measures are applied to the synthetic data used to train the models. Decreases in accuracy are observed in all DT and RF models and in 83% (72/87), 92% (80/87), and 84% (73/87) of SGD, KNN, and SVM models, respectively. In a small number of cases across SGD, KNN, and SVM models trained on synthetic data with SDC measures applied, no change or a slight increase in accuracy compared with models trained on real data with no SDC measures applied was observed.</p>
          <p>The mean absolute difference in accuracy when SDC measures are applied to the training data (compared with machine learning models trained on real data) is small across all machine learning models and for all SDC techniques (<xref ref-type="table" rid="table8">Table 8</xref>). DT and RF models have the largest difference in accuracy, consistent with earlier results of these models trained on synthetic data with no SDC measures applied. The accuracy decreases are consistent across each SDC measure with no SDC measure affecting data utility more notably than any other. These results are also illustrated in the boxplots in <xref rid="figure5" ref-type="fig">Figure 5</xref>. Precision, recall, and F1 scores are also consistent with earlier results when no SDC measures are applied. We therefore consider that the SDC techniques investigated do not have a notable impact on data utility beyond what the standard synthesizers have.</p>
          <table-wrap position="float" id="table8">
            <label>Table 8</label>
            <caption>
              <p>Mean absolute difference in accuracy for each machine learning model and statistical disclosure control type.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="230"/>
              <col width="140"/>
              <col width="150"/>
              <col width="160"/>
              <col width="170"/>
              <col width="150"/>
              <thead>
                <tr valign="top">
                  <td>SDC<sup>a</sup> applied to synthetic dataset</td>
                  <td colspan="5">Average change in accuracy per machine learning algorithm</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SGD<sup>b</sup></td>
                  <td>DT<sup>c</sup></td>
                  <td>KNN<sup>d</sup></td>
                  <td>RF<sup>e</sup></td>
                  <td>SVM<sup>f</sup></td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Smoothing</td>
                  <td>0.059 (5.9)</td>
                  <td>0.190 (19.0)</td>
                  <td>0.094 (9.4)</td>
                  <td>0.177 (17.7)</td>
                  <td>0.060 (6.0)</td>
                </tr>
                <tr valign="top">
                  <td>Unique removal</td>
                  <td>0.052 (5.2)</td>
                  <td>0.206 (20.6)</td>
                  <td>0.072 (7.2)</td>
                  <td>0.184 (18.4)</td>
                  <td>0.056 (5.6)</td>
                </tr>
                <tr valign="top">
                  <td>Minimum leaf size</td>
                  <td>0.061 (6.1)</td>
                  <td>0.200 (20.0)</td>
                  <td>0.068 (6.8)</td>
                  <td>0.180 (18.0)</td>
                  <td>0.053 (5.3)</td>
                </tr>
                <tr valign="top">
                  <td>All</td>
                  <td>0.056 (5.6)</td>
                  <td>0.199 (19.9)</td>
                  <td>0.078 (7.8)</td>
                  <td>0.180 (18.0)</td>
                  <td>0.057 (5.7)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table8fn1">
                <p><sup>a</sup>SDC: statistical disclosure control.</p>
              </fn>
              <fn id="table8fn2">
                <p><sup>b</sup>SGD: stochastic gradient descent.</p>
              </fn>
              <fn id="table8fn3">
                <p><sup>c</sup>DT: decision tree.</p>
              </fn>
              <fn id="table8fn4">
                <p><sup>d</sup>KNN: k-nearest neighbors.</p>
              </fn>
              <fn id="table8fn5">
                <p><sup>e</sup>RF: random forest.</p>
              </fn>
              <fn id="table8fn6">
                <p><sup>f</sup>SVM: support vector machine.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <fig id="figure5" position="float">
            <label>Figure 5</label>
            <caption>
              <p>Overall change in accuracy for each machine learning model when trained on synthetic data across 19 datasets and 2 synthetic data approaches (classification and regression tree [CART] and parametric) and with statistical disclosure control measures applied where smoothing (a; numeric attributes only), unique removal (b), minimum leaf size constrained (c; for CART synthesizer only), and all approaches combined (d), compared with models trained using real data.</p>
            </caption>
            <graphic xlink:href="medinform_v8i7e18910_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <p>We also compare the winning classifier when trained on real data with the winning classifier when trained on synthetic data with SDC applied (<xref ref-type="table" rid="table9">Table 9</xref>). The winning classifier when trained on synthetic data with SDC applied matches the winning classifier when trained on real data in only 25% (22/87) of cases, consistent with earlier results when SDC measures are not applied. Similar results are observed when each SDC measure is considered individually with the winning classifier matching in models trained with real data compared with models trained using synthetic data with SDC measures of smoothing, unique removal, and minimum leaf size in 27% (8/30), 24% (9/38), and 26% (5/19) of cases, respectively.</p>
          <p>Consistent with results in the previous section where SDC measures were not applied, removing tree-based classifiers (DT and RF) from the analysis increases the matches in winning classifiers trained on real compared with synthetic data by 13.3, 36.8, and 36.9 percentage points for each of the SDC measures of smoothing, unique removal, and minimum leaf size, respectively. Overall, an increase of 28.7 percentage points is observed for all SDC measures when tree-based classifiers are removed.</p>
          <table-wrap position="float" id="table9">
            <label>Table 9</label>
            <caption>
              <p>Number of instances where the winning classifier trained on synthetic data with statistical disclosure control applied matches the winning classifier trained on real data across 19 datasets.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="200"/>
              <col width="200"/>
              <col width="260"/>
              <col width="340"/>
              <thead>
                <tr valign="top">
                  <td>Synthetic dataset</td>
                  <td colspan="3">Winning classifier matches for real versus synthetic</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>5 classifiers, n (%)</td>
                  <td>4 classifiers (DT<sup>a</sup> removed), n (%)</td>
                  <td>3 classifiers (DT and RF<sup>b</sup> removed), n (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Smoothing</td>
                  <td>8/30 (27)</td>
                  <td>14/30 (47)</td>
                  <td>12/30 (40)</td>
                </tr>
                <tr valign="top">
                  <td>Unique removal</td>
                  <td>9/38 (24)</td>
                  <td>15/38 (40)</td>
                  <td>23/38 (61)</td>
                </tr>
                <tr valign="top">
                  <td>Minimum leaf size</td>
                  <td>5/19 (26)</td>
                  <td>7/19 (37)</td>
                  <td>12/19 (63)</td>
                </tr>
                <tr valign="top">
                  <td>All</td>
                  <td>22/87 (25)</td>
                  <td>36/87 (41)</td>
                  <td>47/87 (54)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table9fn1">
                <p><sup>a</sup>DT: decision tree.</p>
              </fn>
              <fn id="table9fn2">
                <p><sup>b</sup>RF: random forest.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <p>A chi-square test is applied with the following null and alternative hypotheses:</p>
          <list list-type="bullet">
            <list-item>
              <p>H0: the number of winning classifier matches is equal across all sets of classifiers where SDC measures are applied.</p>
            </list-item>
            <list-item>
              <p>H1: the number of winning classifier matches increases when DT and RF classifiers are removed where SDC measures are applied.</p>
            </list-item>
          </list>
          <p>The level of significance adopted for hypothesis testing is .05 for all tests performed (α=.05).</p>
          <p>The null hypothesis is rejected when the tree-based models (DTs and RFs) are removed (ie, from 5 to 3 classifiers) for data synthesized with the SDC measure of unique removal applied (<xref ref-type="table" rid="table10">Table 10</xref>). Therefore, a significant difference in the matching winning classifiers is observed when tree-based classifiers are removed for this SDC measure. The null hypothesis could not be rejected in all other cases.</p>
          <table-wrap position="float" id="table10">
            <label>Table 10</label>
            <caption>
              <p>Results of chi-square analysis of the difference in matching winning classifiers for models trained on real versus synthetic data with statistical disclosure control applied.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="200"/>
              <col width="240"/>
              <col width="220"/>
              <col width="340"/>
              <thead>
                <tr valign="top">
                  <td>Synthetic dataset</td>
                  <td colspan="3"><italic>P</italic> values</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>5 classifiers</td>
                  <td>4 classifiers (DT<sup>a</sup> removed)</td>
                  <td>3 classifiers (DT and RF<sup>b</sup> removed)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Smoothing</td>
                  <td>.18</td>
                  <td>.79</td>
                  <td>.41</td>
                </tr>
                <tr valign="top">
                  <td>Unique removal</td>
                  <td>.22</td>
                  <td>.11</td>
                  <td>.003</td>
                </tr>
                <tr valign="top">
                  <td>Minimum leaf size</td>
                  <td>.73</td>
                  <td>.19</td>
                  <td>.05</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table10fn1">
                <p><sup>a</sup>DT: decision tree.</p>
              </fn>
              <fn id="table10fn2">
                <p><sup>b</sup>RF: random forest.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>The need for synthetic data, particularly in the health care domain, is gaining increasing attention as privacy protection mechanisms are increasingly failing to protect modern data. Due to valid privacy concerns, it is often difficult or impossible to release real health care data thus impeding critical machine learning research that can make use of this data to drive improved patient outcomes and health policy decision-making. Synthetic data has the potential to overcome data availability issues, providing a valid alternative to real data. A small number of synthetic data generators have been proposed in the literature; however, evidence of their efficacy across a large number of datasets and for use in machine learning is thin on the ground.</p>
        <p>This work has explored the use of fully synthetic data across 19 health care datasets. Three well-known synthetic data generators have been considered where data is generated using CART, parametric, and Bayesian network techniques. A number of research questions have been answered.</p>
        <sec>
          <title>What Is the Differential in Performance When Using Synthetic Data Versus Real Data for Training and Testing Supervised Machine Learning Models?</title>
          <p>Compared with models trained and tested on real data, almost all machine learning models have a slightly lower accuracy when trained on synthetic data and tested on real data across all synthesizers and for all machine learning models analyzed; however, the average decrease in accuracy was small in all cases. Although still small, DT and RF models had a larger decrease and variance in accuracy than SGD, KNN, and SVM models. In addition to accuracy, an analysis of precision, recall, and F1 scores also showed decreases in scores in models trained with synthetic data, with Bayesian network-generated data resulting in more variance than data generated using CART and parametric techniques.</p>
        </sec>
        <sec>
          <title>What Is the Variance of Absolute Difference of Accuracies Between Machine Learning Models Training on Real and Synthetic Datasets?</title>
          <p>The mean absolute difference was consistently small across all models and synthetic datasets suggesting that these values could provide a reliable indicator of the expected decrease in accuracy in supervised machine learning models when developed using synthetic data. Health care departments could expect a manageable small yet consistent decrease in accuracy between real and synthetic data.</p>
        </sec>
        <sec>
          <title>How Often Does the Winning Machine Learning Technique Change When Training Using Real Data to Training Using Synthetic Data?</title>
          <p>The winning classifier when trained on real data matched the winning classifier when trained on synthetic data in only 26% of cases for synthetic data generated using the CART and parametric methods and in just 21% of cases on data synthesized using the Bayesian network technique across the five machine learning models considered (SGD, DT, KNN, RF, and SVM). Tree-based methods were typically the winning classifier for models trained on real data; however, this was often not the case for models trained on synthetic data. When tree-based models were not considered, the winning classifier when trained on real data matched the winning classifier when trained on synthetic data in 74%, 53%, and 68% of cases for synthetic data generated using the CART, parametric, and Bayesian network approaches, respectively. It would appear that tree-based classifiers have some sensitivity to synthetic data, and the underlying cause requires further investigation.</p>
        </sec>
        <sec>
          <title>What Is the Impact of Statistical Disclosure Control (ie, Privacy Protection) Measures on the Utility of Synthetic Data (ie, Similarity to Real Data)?</title>
          <p>The average change in accuracy when SDC measures are applied to the training data was small across all machine learning models and for all SDC techniques. Again, tree-based models produced the largest decrease in accuracy across all SDC techniques. This is attributed to the synthetic data generation method and not the SDC measures, in line with previous results where SDC measures were not applied. We therefore conclude that the SDC techniques considered do not have a notable impact on data utility beyond what the synthetic data generation methods alone produce.</p>
        </sec>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This work has considered the impact of synthetic data on data utility when the data are used to train supervised machine learning algorithms. Further investigation with a broader range of machine learning algorithms, supervised and unsupervised, and including hyperparameter optimization is required. Such studies should cover an even larger range of datasets including, if possible, real health care department case studies.</p>
        <p>Disclosure risk must also be explored in more detail. The impact of SDC measures on data utility has been considered in this work. Disclosure risk must also be measured across synthetic datasets, and a comparison of the data utility and disclosure risk trade-off should be performed.</p>
      </sec>
      <sec>
        <title>Policy and Practice Implications</title>
        <p>A wealth of rich health care data exists with the potential to provide new insights for the prevention of diseases, development of personalized medicine, and support of healthy life across the population. These data are held by health care data gatekeepers (eg, national health care departments) and are generally prevented from release, even for research purposes, due to justifiable privacy concerns around the protection of personal data, ethics, and in guaranteeing citizens’ fundamental rights and freedoms.</p>
        <p>Data sharing and data use demand careful governance, with legislation such as General Data Protection Regulation and the EU-US Privacy Shield placing increasingly stringent guidelines on data management. Data gatekeepers must manage myriad issues in relation to the nature of the data (eg, categories of sensitive data) and descriptions of the technical characteristics of processed data, as well as sharing and management of the data (eg, fair acquisition, data processing and data retention policies, legal basis for information processing, appropriate security measures) and the configuration of information systems that store and process the data.</p>
        <p>From a health care perspective, a range of technical solutions using state-of-the-art machine learning could be developed using health care data with the potential to derive knowledge that can inform and enhance health care policy decision making and risk stratification [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref48">48</xref>]. Such tools can have a positive impact on health policy and practice, meeting the aims of national health departments, for example, as stated by the Department of Health Permanent Secretary in Northern Ireland, Richard Pengelly, in support of the MIDAS project, “the Department seeks to improve the health and social wellbeing of the people of NI, reduce health inequalities, and to assure the provision of appropriate health and social care services in clinical settings and in the community.”</p>
        <p>Accessing health care data to develop such tools is complex, involving a lengthy legal and ethical process, and in some cases access is impossible. Synthetic data can potentially overcome the barriers to accessing data and the need for compliance with data protection legislation as they infringe no privacy or confidentiality while remaining durable, reusable, shareable, clean, and potentially reliable as highlighted by Floridi [<xref ref-type="bibr" rid="ref49">49</xref>], thus accelerating the development of machine learning to inform health care policy. Synthetic data also provide the opportunity to democratize the application of machine learning to health data for the benefit of patients and citizens enabling a larger community to leverage the power of machine learning in health care.</p>
        <p>There is an increasing need for the development and evaluation of a robust and trustworthy synthetic data generator. Policy makers and clinicians who base decisions on models developed with synthetic datasets must be able to do so with the assurance that any knowledge elicited is very likely to be reflected in the real data. Using synthetic datasets to facilitate machine learning without disclosing sensitive data has the potential to revolutionize health care research and policy making in an impactful way by unlocking key research data in a secure way that could drive improvements in population health and well-being much more quickly than is currently observed.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This work considers the efficacy of synthetic data for training supervised machine learning models for use by health care departments. The results are promising with small decreases in accuracy observed in models trained with synthetic data compared with those trained using real data. This work will be further extended to assist in the development of standard baselines for health care departments when using synthetic data (eg, an expected and acceptable decrease in accuracy) and synthetic data generators that can be trusted to produce the same winning model as that which would be produced by real data.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CART</term>
          <def>
            <p>classification and regression tree</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">DT</term>
          <def>
            <p>decision tree</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">KNN</term>
          <def>
            <p>k-nearest neighbors</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">MIDAS</term>
          <def>
            <p>Meaningful Integration of Data, Analytics, and Services</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">RF</term>
          <def>
            <p>random forest</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">SDC</term>
          <def>
            <p>statistical disclosure control</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">SGD</term>
          <def>
            <p>stochastic gradient descent</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The MIDAS Consortium gratefully acknowledges the support to this project from the European Union research fund Big Data Supporting Public Health Policies under grant agreement No. 727721 (H2020-SC1-2016-CNECT SC1-PM-18-2016). We also acknowledge the support of the eHealth &#38; Data Analytics Dementia Pathfinder Programme and Health and Social Care Board eHealth Directorate for this work under award ER/DARUG/09/18/10S.</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>DR was mainly responsible for the paper. She designed and performed the experimental work and drafted the manuscript. All other authors gave feedback on various aspects of the paper including experimental design, machine learning, and policy and practice implications and revised the manuscript. All authors read and approved the final manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rumbold</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Pierscionek</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Contextual anonymization for secondary use of big data in biomedical research: proposal for an anonymization matrix</article-title>
          <source>JMIR Med Inform</source>
          <year>2018</year>
          <month>11</month>
          <day>22</day>
          <volume>6</volume>
          <issue>4</issue>
          <fpage>e47</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2018/4/e47/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/medinform.7096</pub-id>
          <pub-id pub-id-type="medline">30467101</pub-id>
          <pub-id pub-id-type="pii">v6i4e47</pub-id>
          <pub-id pub-id-type="pmcid">PMC6284146</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Sward</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>A roadmap for optimizing asthma care management via computational approaches</article-title>
          <source>JMIR Med Inform</source>
          <year>2017</year>
          <month>09</month>
          <day>26</day>
          <volume>5</volume>
          <issue>3</issue>
          <fpage>e32</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://medinform.jmir.org/2017/3/e32/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/medinform.8076</pub-id>
          <pub-id pub-id-type="medline">28951380</pub-id>
          <pub-id pub-id-type="pii">v5i3e32</pub-id>
          <pub-id pub-id-type="pmcid">PMC5635229</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dankar</surname>
              <given-names>FK</given-names>
            </name>
            <name name-style="western">
              <surname>Madathil</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Dankar</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>Boughorbel</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Privacy-preserving analysis of distributed biomedical data: designing efficient and secure multiparty computations using distributed statistical learning theory</article-title>
          <source>JMIR Med Inform</source>
          <year>2019</year>
          <month>04</month>
          <day>29</day>
          <volume>7</volume>
          <issue>2</issue>
          <fpage>e12702</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2019/2/e12702/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/12702</pub-id>
          <pub-id pub-id-type="medline">31033449</pub-id>
          <pub-id pub-id-type="pii">v7i2e12702</pub-id>
          <pub-id pub-id-type="pmcid">PMC6658266</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Suominen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hanlen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ferraro</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Benchmarking clinical speech recognition and information extraction: new data, methods, and evaluations</article-title>
          <source>JMIR Med Inform</source>
          <year>2015</year>
          <volume>3</volume>
          <issue>2</issue>
          <fpage>e19</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://medinform.jmir.org/2015/2/e19"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/medinform.4321</pub-id>
          <pub-id pub-id-type="medline">25917752</pub-id>
          <pub-id pub-id-type="pii">v3i2e19</pub-id>
          <pub-id pub-id-type="pmcid">PMC4427705</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>New approaches to data dissemination: a glimpse into the future (?)</article-title>
          <source>CHANCE</source>
          <year>2004</year>
          <volume>17</volume>
          <issue>3</issue>
          <fpage>11</fpage>
          <lpage>15</lpage>
          <pub-id pub-id-type="doi">10.1080/09332480.2004.10554907</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rubin</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Statistical disclosure limitation</article-title>
          <source>J Off Stat</source>
          <year>1993</year>
          <volume>9</volume>
          <issue>2</issue>
          <fpage>461</fpage>
          <lpage>468</lpage>
          <pub-id pub-id-type="doi">10.1002/9781118445112.stat00072</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Little</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Statistical analysis of masked data</article-title>
          <source>J Off Stat</source>
          <year>1993</year>
          <volume>9</volume>
          <fpage>407</fpage>
          <lpage>426</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Raghunathan</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rubin</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Multiple imputation for statistical disclosure limitation</article-title>
          <source>J Off Stat</source>
          <year>2003</year>
          <volume>19</volume>
          <fpage>1</fpage>
          <lpage>16</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Simultaneous use of multiple imputation for missing data and disclosure limitation</article-title>
          <source>Survey Methodol</source>
          <year>2004</year>
          <fpage>235</fpage>
          <lpage>242</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://stat.duke.edu/~jerry/Papers/sm04.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>Releasing multiply imputed, synthetic public use microdata: an illustration and empirical study</article-title>
          <source>J Royal Statistical Soc A</source>
          <year>2005</year>
          <month>01</month>
          <volume>168</volume>
          <issue>1</issue>
          <fpage>185</fpage>
          <lpage>205</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1467-985x.2004.00343.x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Significance tests for multi-component estimands from multiply imputed, synthetic microdata</article-title>
          <source>J Stat Plan Infer</source>
          <year>2005</year>
          <month>5</month>
          <volume>131</volume>
          <issue>2</issue>
          <fpage>365</fpage>
          <lpage>377</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jspi.2004.02.003</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Using multiple imputation to integrate and disseminate confidential microdata</article-title>
          <source>Int Stat Rev</source>
          <year>2009</year>
          <volume>77</volume>
          <issue>2</issue>
          <fpage>179</fpage>
          <lpage>195</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1751-5823.2009.00083.x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Raghunathan</surname>
              <given-names>TE</given-names>
            </name>
          </person-group>
          <article-title>The multiple adaptations of multiple imputation</article-title>
          <source>J Am Stat Assoc</source>
          <year>2007</year>
          <month>12</month>
          <volume>102</volume>
          <issue>480</issue>
          <fpage>1462</fpage>
          <lpage>1471</lpage>
          <pub-id pub-id-type="doi">10.1198/016214507000000932</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Drechsler</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Releasing multiply-imputed synthetic data generated in two stages to protect confidentiality</article-title>
          <source>Stat Sinica</source>
          <year>2007</year>
          <volume>20</volume>
          <fpage>405</fpage>
          <lpage>422</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://stat.duke.edu/~jerry/Papers/ss10.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Using CART to generate partially synthetic public use microdata</article-title>
          <source>J Off Stat</source>
          <year>2005</year>
          <volume>21</volume>
          <fpage>441</fpage>
          <lpage>462</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.scb.se/contentassets/ca21efb41fee47d293bbee5bf7be7fb3/using-cart-to-generate-partially-synthetic-public-use-microdata.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ping</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Stoyanovich</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Howe</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>DataSynthesizer: privacy-preserving synthetic datasets</article-title>
          <year>2017</year>
          <conf-name>Proceedings of the 29th International Conference on Scientific and Statistical Database Management</conf-name>
          <conf-date>2017</conf-date>
          <conf-loc>Chicago</conf-loc>
          <pub-id pub-id-type="doi">10.1145/3085504.3091117</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nowok</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Raab</surname>
              <given-names>GM</given-names>
            </name>
            <name name-style="western">
              <surname>Dibben</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>synthpop: bespoke creation of synthetic data in R</article-title>
          <source>J Stat Soft</source>
          <year>2016</year>
          <volume>74</volume>
          <issue>11</issue>
          <pub-id pub-id-type="doi">10.18637/jss.v074.i11</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Heyburn</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bond</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Black</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mulvenna</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wallace</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rankin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Cleland</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Machine learning using synthetic and real data: similarity of evaluation metrics for different healthcare datasets and for different algorithms</article-title>
          <year>2018</year>
          <conf-name>Proceedings of the 13th International FLINS Conference</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>Belfast</conf-loc>
          <pub-id pub-id-type="doi">10.1142/9789813273238_0160</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dua</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Graff</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <source>UCI machine learning repository</source>
          <year>2019</year>
          <access-date>2020-06-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://archive.ics.uci.edu/ml">http://archive.ics.uci.edu/ml</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dalianis</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Evaluation metrics and evaluation</article-title>
          <source>Clinical Text Mining</source>
          <year>2018</year>
          <publisher-loc>Cham</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>45</fpage>
          <lpage>53</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Domingo-Ferrer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Torra</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Disclosure risk assessment in statistical data protection</article-title>
          <source>J Comput Appl Math</source>
          <year>2004</year>
          <month>03</month>
          <volume>164-165</volume>
          <fpage>285</fpage>
          <lpage>293</lpage>
          <pub-id pub-id-type="doi">10.1016/s0377-0427(03)00643-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abowd</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Stinson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Benedetto</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <source>Final Report to the Social Security Administration on the SIPP/SSA/IRS Public Use File Project</source>
          <year>2006</year>
          <access-date>2020-06-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ecommons.cornell.edu/bitstream/handle/1813/43929/SSAfinal.pdf?sequence=3&#38;isAllowed=y">https://ecommons.cornell.edu/bitstream/handle/1813/43929/SSAfinal.pdf?sequence=3&#38;isAllowed=y</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Benedetto</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Stinson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Abowd</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <source>The Creation and Use of the SIPP Synthetic Beta</source>
          <year>2013</year>
          <access-date>2020-06-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ecommons.cornell.edu/bitstream/handle/1813/43924/SSBdescribe_nontechnical.pdf?sequence=3&#38;isAllowed=y">https://ecommons.cornell.edu/bitstream/handle/1813/43924/SSBdescribe_nontechnical.pdf?sequence=3&#38;isAllowed=y</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dushi</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Munnell</surname>
              <given-names>AH</given-names>
            </name>
            <name name-style="western">
              <surname>Sanzenbacher</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Webb</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Do households increase their savings when the kids leave home?</article-title>
          <source>SSRN Journal</source>
          <year>2015</year>
          <pub-id pub-id-type="doi">10.2139/ssrn.2669704</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chenevert</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <source>Changing levels of spousal education and labor force supply</source>
          <year>2012</year>
          <access-date>2020-06-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.census.gov/content/dam/Census/library/working-papers/2012/demo/SIPP-WP-263.pdf">https://www.census.gov/content/dam/Census/library/working-papers/2012/demo/SIPP-WP-263.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Downs</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Sandler</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sienkiewicz</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <source>The parental gender earnings gap in the United States</source>
          <year>2017</year>
          <access-date>2020-06-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www2.census.gov/ces/wp/2017/CES-WP-17-68.pdf">https://www2.census.gov/ces/wp/2017/CES-WP-17-68.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Benedetto</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gathright</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Stinson</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <source>The earnings impact of graduating from college during a recession</source>
          <year>2010</year>
          <access-date>2020-06-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www2.vrdc.cornell.edu/news/wp-content/papercite-data/pdf/benedettogathrightstinson-11301.pdf">https://www2.vrdc.cornell.edu/news/wp-content/papercite-data/pdf/benedettogathrightstinson-11301.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Carr</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Wiemers</surname>
              <given-names>EE</given-names>
            </name>
          </person-group>
          <article-title>New evidence on earnings volatility in survey and administrative data</article-title>
          <source>AEA Papers Proc</source>
          <year>2018</year>
          <month>05</month>
          <day>01</day>
          <volume>108</volume>
          <fpage>287</fpage>
          <lpage>291</lpage>
          <pub-id pub-id-type="doi">10.1257/pandp.20181050</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Greenstone</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mas</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Do credit market shocks affect the real economy? Quasi-experimental evidence from the great recession and "normal" economic times</article-title>
          <source>Am Econ J Econ Policy</source>
          <year>2020</year>
          <volume>12</volume>
          <issue>1</issue>
          <fpage>200</fpage>
          <lpage>225</lpage>
          <pub-id pub-id-type="doi">10.3386/w20704</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kinney</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Reznek</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Miranda</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jarmin</surname>
              <given-names>RS</given-names>
            </name>
            <name name-style="western">
              <surname>Abowd</surname>
              <given-names>JM</given-names>
            </name>
          </person-group>
          <article-title>Towards unrestricted public use business microdata: the synthetic longitudinal business database</article-title>
          <source>SSRN J</source>
          <year>2011</year>
          <volume>79</volume>
          <issue>3</issue>
          <fpage>362</fpage>
          <lpage>384</lpage>
          <pub-id pub-id-type="doi">10.2139/ssrn.1759422</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Machanavajjhala</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kifer</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Abowd</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gehrke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Vilhuber</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Privacy: theory meets practice on the map</article-title>
          <year>2008</year>
          <conf-name>Proceedings of the IEEE International Conference on Data Engineering</conf-name>
          <conf-date>2008</conf-date>
          <conf-loc>Cancun</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icde.2008.4497436</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hattersley</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Cresser</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <source>Longitudinal Study 1971-1991: history, organisation and quality of data</source>
          <year>1995</year>
          <access-date>2020-06-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://census.ukdataservice.ac.uk/media/51156/1971_defs.pdf">https://census.ukdataservice.ac.uk/media/51156/1971_defs.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Boyle</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>Feijten</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Hattersley</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Nolan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Raab</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Cohort profile: the Scottish Longitudinal Study (SLS)</article-title>
          <source>Int J Epidemiol</source>
          <year>2009</year>
          <month>04</month>
          <volume>38</volume>
          <issue>2</issue>
          <fpage>385</fpage>
          <lpage>392</lpage>
          <pub-id pub-id-type="doi">10.1093/ije/dyn087</pub-id>
          <pub-id pub-id-type="medline">18492728</pub-id>
          <pub-id pub-id-type="pii">dyn087</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>O'Reilly</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Rosato</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Catney</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Johnston</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Brolly</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Cohort description: the Northern Ireland Longitudinal Study (NILS)</article-title>
          <source>Int J Epidemiol</source>
          <year>2012</year>
          <month>06</month>
          <volume>41</volume>
          <issue>3</issue>
          <fpage>634</fpage>
          <lpage>641</lpage>
          <pub-id pub-id-type="doi">10.1093/ije/dyq271</pub-id>
          <pub-id pub-id-type="medline">21296852</pub-id>
          <pub-id pub-id-type="pii">dyq271</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Miranda</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Vilhuber</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <source>Looking back on three years of Synthetic LBD Beta</source>
          <year>2014</year>
          <access-date>2020-06-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://digitalcommons.ilr.cornell.edu/cgi/viewcontent.cgi?article=1013&#38;context=ldi">https://digitalcommons.ilr.cornell.edu/cgi/viewcontent.cgi?article=1013&#38;context=ldi</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Black</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rankin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wallace</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bond</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Mulvenna</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cleland</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Carlin</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Fischaber</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Epelde</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Nikolic</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Pajula</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Connolly</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Meaningful integration of data, analytics and services of computer-based medical systems: the MIDAS touch</article-title>
          <year>2019</year>
          <conf-name>Proceedings of IEEE 32nd International Symposium on Computer-Based Medical Systems (CBMS)</conf-name>
          <conf-date>2019</conf-date>
          <conf-loc>Cordoba</conf-loc>
          <pub-id pub-id-type="doi">10.1109/cbms.2019.00031</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Davenport</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kalakota</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>The potential for artificial intelligence in healthcare</article-title>
          <source>Future Healthc J</source>
          <year>2019</year>
          <month>06</month>
          <volume>6</volume>
          <issue>2</issue>
          <fpage>94</fpage>
          <lpage>98</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/31363513"/>
          </comment>
          <pub-id pub-id-type="doi">10.7861/futurehosp.6-2-94</pub-id>
          <pub-id pub-id-type="medline">31363513</pub-id>
          <pub-id pub-id-type="pii">futurehealth</pub-id>
          <pub-id pub-id-type="pmcid">PMC6616181</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Piai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Claps</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <source>Bigger data for better healthcare</source>
          <year>2013</year>
          <access-date>2020-06-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/bigger-data-better-healthcare-idc-insights-white-paper.pdf">https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/bigger-data-better-healthcare-idc-insights-white-paper.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rabesandratana</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <source>European data law is impeding studies on diabetes and Alzheimer's, researchers warn</source>
          <year>2019</year>
          <access-date>2020-06-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.sciencemag.org/news/2019/11/european-data-law-impeding-studies-diabetes-and-alzheimer-s-researchers-warn">https://www.sciencemag.org/news/2019/11/european-data-law-impeding-studies-diabetes-and-alzheimer-s-researchers-warn</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lugg-Widger</surname>
              <given-names>FV</given-names>
            </name>
            <name name-style="western">
              <surname>Angel</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Cannings-John</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Hood</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hughes</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Moody</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Robling</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Challenges in accessing routinely collected data from multiple providers in the UK for primary studies: managing the morass</article-title>
          <source>IJPDS</source>
          <year>2018</year>
          <month>09</month>
          <day>12</day>
          <volume>3</volume>
          <issue>3</issue>
          <pub-id pub-id-type="doi">10.23889/ijpds.v3i3.432</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lane</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schur</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Balancing access to health data and privacy: a review of the issues and approaches for the future</article-title>
          <source>Health Serv Res</source>
          <year>2010</year>
          <month>10</month>
          <volume>45</volume>
          <issue>5 Pt 2</issue>
          <fpage>1456</fpage>
          <lpage>1467</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/21054366"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/j.1475-6773.2010.01141.x</pub-id>
          <pub-id pub-id-type="medline">21054366</pub-id>
          <pub-id pub-id-type="pmcid">PMC2965886</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bates</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Saria</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ohno-Machado</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Escobar</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Big data in health care: using analytics to identify and manage high-risk and high-cost patients</article-title>
          <source>Health Aff (Millwood)</source>
          <year>2014</year>
          <month>07</month>
          <volume>33</volume>
          <issue>7</issue>
          <fpage>1123</fpage>
          <lpage>1131</lpage>
          <pub-id pub-id-type="doi">10.1377/hlthaff.2014.0041</pub-id>
          <pub-id pub-id-type="medline">25006137</pub-id>
          <pub-id pub-id-type="pii">33/7/1123</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="web">
          <source>MIDAS: meaningful integration of data, analytics and services</source>
          <year>2020</year>
          <access-date>2020-06-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.midasproject.eu/">http://www.midasproject.eu/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Templ</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Meindl</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kowarik</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dupriez</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Simulation of synthetic complex data: the R package simPop</article-title>
          <source>J Stat Softw</source>
          <year>2017</year>
          <volume>79</volume>
          <issue>10</issue>
          <pub-id pub-id-type="doi">10.18637/jss.v079.i10</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Walonoski</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kramer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nichols</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Quina</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Moesel</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hall</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Duffett</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Dube</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gallagher</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>McLachlan</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Synthea: an approach, method, and software mechanism for generating synthetic patients and the synthetic electronic health care record</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2017</year>
          <month>08</month>
          <day>30</day>
          <volume>25</volume>
          <issue>3</issue>
          <fpage>230</fpage>
          <lpage>238</lpage>
          <pub-id pub-id-type="doi">10.1093/jamia/ocx079</pub-id>
          <pub-id pub-id-type="medline">29025144</pub-id>
          <pub-id pub-id-type="pii">4098271</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Elliot</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <source>Final report on the disclosure risk associated with the synthetic data produced by the SYLLS team</source>
          <year>2015</year>
          <access-date>2020-06-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cmi.manchester.ac.uk/research/publications/reports/">https://www.cmi.manchester.ac.uk/research/publications/reports/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ritchie</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Elliot</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Principles- versus rules-based output statistical disclosure control in remote access environments</article-title>
          <source>IASSIST Q</source>
          <year>2015</year>
          <month>12</month>
          <day>11</day>
          <volume>39</volume>
          <issue>2</issue>
          <fpage>5</fpage>
          <pub-id pub-id-type="doi">10.29173/iq778</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reiner Benaim</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Almog</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gorelik</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hochberg</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Nassar</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mashiach</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Khamaisi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lurie</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Azzam</surname>
              <given-names>ZS</given-names>
            </name>
            <name name-style="western">
              <surname>Khoury</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kurnik</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Beyar</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Analyzing medical research results based on synthetic data and their relation to real data results: systematic comparison from five observational studies</article-title>
          <source>JMIR Med Inform</source>
          <year>2020</year>
          <month>02</month>
          <day>20</day>
          <volume>8</volume>
          <issue>2</issue>
          <fpage>e16492</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2020/2/e16492/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/16492</pub-id>
          <pub-id pub-id-type="medline">32130148</pub-id>
          <pub-id pub-id-type="pii">v8i2e16492</pub-id>
          <pub-id pub-id-type="pmcid">PMC7059086</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Floridi</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>What the near future of artificial intelligence could be</article-title>
          <source>Philos Technol</source>
          <year>2019</year>
          <month>3</month>
          <day>19</day>
          <volume>32</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>15</lpage>
          <pub-id pub-id-type="doi">10.1007/s13347-019-00345-y</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
