<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn></journal-meta><article-meta><article-id pub-id-type="publisher-id">55118</article-id><article-id pub-id-type="doi">10.2196/55118</article-id><title-group><article-title>Comparison of Synthetic Data Generation Techniques for Control Group Survival Data in Oncology Clinical Trials: Simulation Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Akiya</surname><given-names>Ippei</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ishihara</surname><given-names>Takuma</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yamamoto</surname><given-names>Keiichi</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>Biometrics, ICON Clinical Research GK</institution>, <addr-line>Tokyo</addr-line>, <country>Japan</country></aff><aff id="aff2"><institution>Innovative and Clinical Research Promotion Center, Gifu University Hospital</institution>, <addr-line>Gifu</addr-line>, <country>Japan</country></aff><aff id="aff3"><institution>Division of Data Science, Center 
for Industrial Research and Innovation, Translational Research Institute for Medical Innovation, Osaka Dental University</institution>, <addr-line>Osaka</addr-line>, <country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Lovis</surname><given-names>Christian</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Hu</surname><given-names>Danqing</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Song</surname><given-names>Jiangdian</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Ippei Akiya, MSc<email>ippei.akiya@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2024</year></pub-date><pub-date pub-type="epub"><day>18</day><month>6</month><year>2024</year></pub-date><volume>12</volume><elocation-id>e55118</elocation-id><history><date date-type="received"><day>03</day><month>12</month><year>2023</year></date><date date-type="rev-recd"><day>06</day><month>04</month><year>2024</year></date><date date-type="accepted"><day>08</day><month>05</month><year>2024</year></date></history><copyright-statement>&#x00A9; Ippei Akiya, Takuma Ishihara, Keiichi Yamamoto. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 18.6.2024. 
</copyright-statement><copyright-year>2024</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2024/1/e55118"/><abstract><sec><title>Background</title><p>Synthetic patient data (SPD) generation for survival analysis in oncology trials holds significant potential for accelerating clinical development. Various machine learning methods, including classification and regression trees (CART), random forest (RF), Bayesian network (BN), and conditional tabular generative adversarial network (CTGAN), have been used for this purpose, but their performance in reflecting actual patient survival data remains under investigation.</p></sec><sec><title>Objective</title><p>The aim of this study was to determine the most suitable SPD generation method for oncology trials, specifically focusing on both progression-free survival (PFS) and overall survival (OS), which are the primary evaluation end points in oncology trials. 
To achieve this goal, we conducted a comparative simulation of 4 generation methods, including CART, RF, BN, and the CTGAN, and the performance of each method was evaluated.</p></sec><sec sec-type="methods"><title>Methods</title><p>Using multiple clinical trial data sets, 1000 data sets were generated by using each method for each clinical trial data set and evaluated as follows: (1) median survival time (MST) of PFS and OS; (2) hazard ratio distance (HRD), which indicates the similarity between the actual survival function and a synthetic survival function; and (3) visual analysis of Kaplan-Meier (KM) plots. Each method&#x2019;s ability to mimic the statistical properties of real patient data was evaluated from these multiple angles.</p></sec><sec sec-type="results"><title>Results</title><p>In most simulation cases, CART demonstrated high percentages of MSTs for synthetic data falling within the 95% CI range of the MST of the actual data. These percentages ranged from 88.8% to 98.1% for PFS and from 60.8% to 96.1% for OS. In the evaluation of HRD, the HRD values for CART were concentrated at approximately 0.9. Conversely, for the other methods, no consistent trend was observed for either PFS or OS. CART demonstrated better similarity than RF, in that CART caused overfitting and RF (a kind of ensemble learning approach) prevented it. In SPD generation, the statistical properties close to the actual data should be the focus, not a well-generalized prediction model. Both the BN and CTGAN methods cannot accurately reflect the statistical properties of the actual data because these methods are not well suited to small data sets.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>As a method for generating SPD for survival data from small data sets, such as clinical trial data, CART was demonstrated to be the most effective method compared to RF, BN, and CTGAN. 
Additionally, it is possible to improve CART-based generation methods by incorporating feature engineering and other methods in future work.</p></sec></abstract><kwd-group><kwd>oncology clinical trial</kwd><kwd>survival analysis</kwd><kwd>synthetic patient data</kwd><kwd>machine learning</kwd><kwd>SPD</kwd><kwd>simulation</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>When submitting an application for the approval of a new pharmaceutical product to health authorities, it is imperative to demonstrate its efficacy and safety through multiple clinical trials. However, 86% of these trials encounter difficulties meeting the targeted number of subjects within the designated recruitment period, often leading to extensions of the trial duration or completion of the trial without reaching the target number of subjects [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. The challenge of patient recruitment not only delays the submission of regulatory applications but also hinders the timely provision of effective treatment to patients, which consequently contributes to increased development costs and the escalation of drug prices and potentially exacerbates the strain on health care financing.</p><p>In recent years, the use of real-world data (RWD) has emerged as a potential solution for addressing these issues. The Food and Drug Administration has also released draft guidelines [<xref ref-type="bibr" rid="ref4">4</xref>], garnering attention on the application of RWD as an external control arm in clinical trials [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. 
Furthermore, it has been reported that it is possible to optimize eligibility using RWD and machine learning, thereby increasing the number of eligible subjects that can be included [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>In addition to these approaches, we hypothesize that it is possible to generate synthetic patient data (SPD) from control arm data in past clinical trials and use it to establish a control arm for a new clinical trial. The use of SPD, an emerging research approach in the health care research field [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref17">17</xref>], involves the generation of fictitious individual patient-level data from real data, which possess statistical properties similar to those of actual data. This approach is anticipated to facilitate health care research while addressing data privacy concerns [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>Regarding its application in clinical trials, concerns have been raised about the feasibility of generating SPDs with statistical properties similar to those of actual data due to the relatively smaller volume of clinical trial data compared to RWD, such as electronic health records or registry data. However, previous studies [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref25">25</xref>] have reported the successful generation of SPDs with statistical properties generally comparable to the actual data, although there are certain limitations. Additionally, with the expansion of clinical trial data-sharing platforms such as ClinicalStudyDataRequest.com, Project Data Sphere, and Vivli, acquiring subject-level clinical trial data has become more accessible. 
Consequently, advancements in research on the utility of SPD and the expansion of clinical trial data-sharing platforms are expected to have potential applications in clinical trials.</p><p>Our focus lies in the application of this technology in oncology clinical trials that evaluate popular efficacy end points such as overall survival (OS) and progression-free survival (PFS)&#x2013;related survival functions and median survival time (MST) [<xref ref-type="bibr" rid="ref26">26</xref>]. In previous studies on SPD, there has been a notable emphasis on reporting patient background data and single&#x2013;time point data [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref25">25</xref>]. However, research focusing specifically on the relationship between SPD and survival data remains relatively insufficient [<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>As the first step in examining our hypothesis that the use of SPD can be beneficial in accelerating health care research, the aim of this study was to determine the most suitable SPD generation method for oncology trials, specifically focusing on both OS and PFS, which are set as the primary evaluation end points in oncology trials. 
To achieve this goal, we conducted a comparative simulation of 4 generation methods: classification and regression trees (CART) [<xref ref-type="bibr" rid="ref28">28</xref>], random forest (RF) [<xref ref-type="bibr" rid="ref29">29</xref>], Bayesian network (BN) [<xref ref-type="bibr" rid="ref30">30</xref>], and the conditional tabular generative adversarial network (CTGAN) approach [<xref ref-type="bibr" rid="ref31">31</xref>], and the performance of each method was evaluated.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>To generate the SPD, subject-level clinical trial data were obtained from Project Data Sphere for 4 clinical trials (<xref ref-type="table" rid="table1">Table 1</xref>) selected according to the following criteria: (1) each trial involved a different cancer type, (2) control arm data were included, (3) both OS and PFS data were available, and (4) the data were in an analysis-ready format.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>List of selected oncology clinical trials in this study.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">ClinicalTrials.gov ID</td><td align="left" valign="bottom">Titles</td><td align="left" valign="bottom">Phase</td><td align="left" valign="bottom">Cancer type</td><td align="left" valign="bottom">Intervention for the control arm</td><td align="left" valign="bottom">Subjects in the control arm, n</td></tr></thead><tbody><tr><td align="left" valign="top">NCT00119613</td><td align="left" valign="top">A Randomized, Double-Blind, Placebo-Controlled Study of Subjects With Previously Untreated Extensive-Stage Small-Cell Lung Cancer (SCLC) Treated With Platinum Plus Etoposide Chemotherapy With or Without Darbepoetin Alfa.</td><td align="left" valign="top">III</td><td align="left" valign="top">Small cell lung cancer</td><td align="left" valign="top">Placebo</td><td align="char" char="." 
valign="top">232</td></tr><tr><td align="left" valign="top">NCT00339183</td><td align="left" valign="top">A Randomized, Multicenter Phase 3 Study to Compare the Efficacy of Panitumumab in Combination With Chemotherapy to the Efficacy of Chemotherapy Alone in Patients With Previously Treated Metastatic Colorectal Cancer.</td><td align="left" valign="top">III</td><td align="left" valign="top">Metastatic colorectal cancer</td><td align="left" valign="top">FOLFIRI<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> Alone</td><td align="char" char="." valign="top">476</td></tr><tr><td align="left" valign="top">NCT00460265</td><td align="left" valign="top">A Phase 3 Randomized Trial of Chemotherapy With or Without Panitumumab in Patients With Metastatic and/or Recurrent Squamous Cell Carcinoma of the Head and Neck (SCCHN).</td><td align="left" valign="top">III</td><td align="left" valign="top">Recurrent or metastatic (or both) head and neck cancer</td><td align="left" valign="top">Cisplatin and 5-fluorouracil</td><td align="char" char="." valign="top">260</td></tr><tr><td align="left" valign="top">NCT00703326</td><td align="left" valign="top">A Multicenter, Multinational, Randomized, Double-Blind, Phase III Study of IMC-1121B Plus Docetaxel versus Placebo Plus Docetaxel in Previously Untreated Patients With HER2-Negative, Unresectable, Locally-Recurrent or Metastatic Breast Cancer.</td><td align="left" valign="top">III</td><td align="left" valign="top">Breast cancer</td><td align="left" valign="top">Placebo and docetaxel</td><td align="char" char="." valign="top">382</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>FOLFIRI: fluorouracil, leucovorin, and irinotecan.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-2"><title>Preparation of the Training Data Set</title><p>The patient data for the control arm contained within each trial data set were extracted and used as the actual data for the training data set. 
The selection of variables in the training data set aimed to include as many variables related to the subjects&#x2019; background as possible, excluding variables concerning tests and evaluations conducted during the trials. Furthermore, variables that had the same value were excluded, even if they were related to the subjects&#x2019; background (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendices 1</xref><xref ref-type="supplementary-material" rid="app2"/><xref ref-type="supplementary-material" rid="app3"/>-<xref ref-type="supplementary-material" rid="app4">4</xref>).</p></sec><sec id="s2-3"><title>Generation of Synthetic Data</title><p>The SPDs in this study were generated using the following 4 methods:</p><list list-type="order"><list-item><p>CART: the synthpop package (version 1.8) in R (The R Foundation) was used, specifying the cart method for the syn function&#x2019;s method argument.</p></list-item><list-item><p>RF: the synthpop package (version 1.8) in R was used, specifying the Ranger method for the syn function&#x2019;s method argument.</p></list-item><list-item><p>BN: the bnlearn package (version 4.9) in R was used to conduct structural learning through the score-based algorithm hill-climbing, followed by parameter estimation using the bn.fit function. The default maximum likelihood estimator was used for parameter estimation.</p></list-item><list-item><p>CTGAN: the CTGANSynthesizer module included in the Python package sdv (version 1.3) was used.</p></list-item></list><p>In all these generation methods, to ensure the absence of conflicting data regarding the relationship between PFS and OS, constraints were set to ensure that the values of PFS and OS were greater than zero and that PFS was less than or equal to OS. Specific individual patient data in the generated SPD, which did not meet these constraints, were excluded, and new individual patient data were regenerated. 
The SPDs were generated in a manner that equaled the number of subject-level data to the record count in the actual data.</p><p>To ensure the reproducibility of SPD generation, 1000 random numbers were generated as seed values using the Mersenne Twister algorithm. The same seed value set was used for all generation methods.</p></sec><sec id="s2-4"><title>Statistical Analysis</title><sec id="s2-4-1"><title>Histogram</title><p>Histograms were created to visually inspect the distributions of the MST of the synthetic data (MSTS) for PFS and OS for the 1000 SPD data sets generated by each method. The histograms also included the MST of the actual data (MSTA) as a vertical line and the range of its 95% CI as a rectangular background. For PFS and OS, a higher percentage of MSTS covered by the 95% CI of the MSTA was determined to indicate a greater level of reliability for the generation method.</p></sec><sec id="s2-4-2"><title>Evaluation of Similarity</title><p>A hazard ratio (HR) of 1 signifies that the 2 survival functions are entirely identical. Thus, the closer the HR is to 1, the more similar the 2 survival functions are. 
Accordingly, based on the following calculation formula, the HR distance (HRD) for PFS and OS from the SPD and the actual data were computed and evaluated:</p><disp-formula id="E1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mi mathvariant="normal">H</mml:mi><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">D</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">b</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="normal">H</mml:mi><mml:mi mathvariant="normal">R</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:math></disp-formula></sec><sec id="s2-4-3"><title>Kaplan-Meier Plot</title><p>In the evaluation of similarity, the SPD that showed the highest HRD value was considered the best case, and the SPD with the lowest HRD value was considered the worst case. Three groups of Kaplan-Meier (KM) plots were created, including the actual data, the best case, and the worst case for each SPD generation method. The best case and worst case for each SPD generation method in both PFS and OS were compared to actual survival by using the log rank test. 
Multiple comparisons were not performed, nor were <italic>P</italic> values adjusted because controlling for the type I error rate does not affect the conclusions of this study.</p><p>Since the purpose of this study was to evaluate the method of generating SPD that closely resemble actual survival data, it might be unnecessary to calculate a <italic>P</italic> value that indicates a significant difference from actual survival, but the <italic>P</italic> value was calculated in this study from the viewpoint that if a significant difference is also observed in the best-case, that method should not be adopted.</p><p>All analyses and data generation were performed using R (version 4.3.1; The R Foundation) and Python (version 3.10; Python Software Foundation).</p></sec></sec><sec id="s2-5"><title>Ethical Considerations</title><p>Ethical review was not needed for this simulation study for methodology comparison. All actual clinical trial data sets obtained from Project Data Sphere were used in accordance with relevant guidelines and regulations when the clinical trials were conducted.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p><xref ref-type="fig" rid="figure1">Figure 1</xref> shows a histogram of the MSTS for PFS in the NCT00703326 trial. Using CART, RF, and BN, most of the generated MSTS values were within the 95% CI of the MSTA. In contrast, when CTGAN was used, SPD generation resulted in a widened variance in the distribution of MSTS. For the MSTS of PFS in the other 3 trials, RF exhibited a shift in the distribution of the MSTS, shortening the survival period, while BN displayed a shift in the distribution and prolonged the survival period. 
Similar trends to <xref ref-type="fig" rid="figure1">Figure 1</xref> were observed for CART and CTGAN (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendices 5</xref><xref ref-type="supplementary-material" rid="app6"/>-<xref ref-type="supplementary-material" rid="app7">7</xref>).</p><p><xref ref-type="fig" rid="figure2">Figure 2</xref> displays a histogram of the MSTS for OS in the NCT00460265 trial. The divergence from the PFS findings is that the MSTS of RF was more frequently included within the 95% CI of the MSTA, with similar results observed in other trials (<xref ref-type="supplementary-material" rid="app8">Multimedia Appendices 8</xref><xref ref-type="supplementary-material" rid="app9"/>-<xref ref-type="supplementary-material" rid="app10">10</xref>). In other aspects, similar findings were obtained as with the PFS.</p><p><xref ref-type="table" rid="table2">Table 2</xref> presents the number and proportion of the generated MSTS values included within the 95% CI of the MSTA for each trial and each method. In the case of CART for PFS, a high percentage ranging from 88.8% to 98.1% was exhibited for all trials. However, the OS ranged from 60.8% to 96.1%, with some trials displaying a lower percentage than the PFS.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Histogram of the median survival time of the synthetic data for progression-free survival in the NCT00703326 trial. The dashed vertical line represents the median survival time of the actual data, and the light blue background indicates its 95% CI. 
BN: Bayesian network; CART: classification and regression tree; CTGAN: conditional tabular generative adversarial network; MST: median survival time; RF: random forest.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e55118_fig01.png"/></fig><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Histogram of the median survival time of the synthetic data of overall survival in the NCT00460265 trial. The dashed vertical line represents the median survival time of the actual data, and the light blue background indicates its 95% CI. BN: Bayesian network; CART: classification and regression tree; CTGAN: conditional tabular generative adversarial network; MST: median survival time; RF: random forest.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e55118_fig02.png"/></fig><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>The number and proportion of median survival times of the synthetic data (MSTSs) falling within the 95% CI of the median survival time of the actual data (MSTA).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2"/><td align="left" valign="bottom" colspan="4">ClinicalTrials.gov ID</td></tr><tr><td align="left" valign="bottom" colspan="2"/><td align="left" valign="bottom">NCT00119613</td><td align="left" valign="bottom">NCT00339183</td><td align="left" valign="bottom">NCT00460265</td><td align="left" valign="bottom">NCT00703326</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="6"><bold>Progression-free survival</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">MSTA (95% CI)</td><td align="left" valign="top">169 (163-183)</td><td align="left" valign="top">155 (121-168)</td><td align="left" valign="top">133 (121-167)</td><td align="left" valign="top">424 
(380-504)</td></tr><tr><td align="left" valign="top" colspan="6"><bold>MSTSs, n (%)</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">CART<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> (n=1000)</td><td align="left" valign="top">981 (98.1)</td><td align="left" valign="top">888 (88.8)</td><td align="left" valign="top">955 (95.5)</td><td align="left" valign="top">918 (91.8)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">RF<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> (n=1000)</td><td align="left" valign="top">693 (69.3)</td><td align="left" valign="top">248 (24.8)</td><td align="left" valign="top">426 (42.6)</td><td align="left" valign="top">919 (91.9)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">BN<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> (n=1000)</td><td align="left" valign="top">10 (1.0)</td><td align="left" valign="top">0 (0.0)</td><td align="left" valign="top">37 (3.7)</td><td align="left" valign="top">976 (97.6)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">CTGAN<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup> (n=1000)</td><td align="left" valign="top">65 (6.5)</td><td align="left" valign="top">378 (37.8)</td><td align="left" valign="top">322 (32.2)</td><td align="left" valign="top">254 (25.5)</td></tr><tr><td align="left" valign="top" colspan="6"><bold>Overall survival</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">MSTA (95% CI)</td><td align="left" valign="top">276 (259-303)</td><td align="left" valign="top">361 (319-393)</td><td align="left" valign="top">286 (255-357)</td><td align="left" valign="top">1452 (1417-1507)</td></tr><tr><td align="left" valign="top" colspan="6"><bold>MSTSs, n (%)</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">CART (n=1000)</td><td align="left" valign="top">831 (83.1)</td><td align="left" 
valign="top">608 (60.8)</td><td align="left" valign="top">719 (71.9)</td><td align="left" valign="top">961 (96.1)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">RF (n=1000)</td><td align="left" valign="top">757 (75.7)</td><td align="left" valign="top">697 (69.7)</td><td align="left" valign="top">980 (98.0)</td><td align="left" valign="top">599 (59.9)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">BN (n=1000)</td><td align="left" valign="top">0 (0.0)</td><td align="left" valign="top">0 (0.0)</td><td align="left" valign="top">0 (0.0)</td><td align="left" valign="top">622 (62.2)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">CTGAN (n=1000)</td><td align="left" valign="top">72 (7.2)</td><td align="left" valign="top">155 (15.5)</td><td align="left" valign="top">197 (19.7)</td><td align="left" valign="top">81 (8.5)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>CART: classification and regression tree.</p></fn><fn id="table2fn2"><p><sup>b</sup>RF: random forest.</p></fn><fn id="table2fn3"><p><sup>c</sup>BN: Bayesian network.</p></fn><fn id="table2fn4"><p><sup>d</sup>CTGAN: conditional tabular generative adversarial network.</p></fn></table-wrap-foot></table-wrap><p>For RF, a high proportion of 91.9% was observed for PFS in the NCT00703326 trial and 98.0% for OS in the NCT00460265 trial, whereas in other cases, the proportion for RF was not as high as that for CART.</p><p>In the case of BN, proportions of 97.6% and 62.2% were observed for PFS and OS, respectively, in the NCT00703326 trial, but in the other 3 trials, BN showed an extremely low proportion ranging from 0.0% to 3.7%.</p><p>CTGAN showed a low proportion ranging from 6.5% to 37.8% for both PFS and OS in all trials.</p><p><xref ref-type="fig" rid="figure3">Figure 3</xref> shows the KM plot for PFS in the NCT00703326 trial. 
The best-case curves of CART and RF were similar to the actual data curve. In contrast, for BN and CTGAN, even the best-case curves deviated from the actual data curve. In other trials, some SPD did not show a similar trend. However, at least for the best-case scenarios of CART and RF, the generated synthetic survival curves closely resembled the actual survival curve (<xref ref-type="supplementary-material" rid="app11">Multimedia Appendices 11</xref><xref ref-type="supplementary-material" rid="app12"/>-<xref ref-type="supplementary-material" rid="app13">13</xref>).</p><p><xref ref-type="fig" rid="figure4">Figure 4</xref> displays the KM plot for OS in the NCT00460265 trial. Similar to the KM plots for PFS, the best-case curves of CART and RF resembled the actual data curve, whereas those of BN and CTGAN deviated from the actual data curve. These trends were also observed in other trials (<xref ref-type="supplementary-material" rid="app14">Multimedia Appendices 14</xref><xref ref-type="supplementary-material" rid="app15"/>-<xref ref-type="supplementary-material" rid="app16">16</xref>).</p><p><xref ref-type="fig" rid="figure5">Figures 5</xref> and <xref ref-type="fig" rid="figure6">6</xref> present box plots of the HRD. When using CART, the HRD values for both PFS and OS in all trials were concentrated at approximately 0.9. Conversely, for the other methods, no consistent trend was observed for either PFS or OS.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Kaplan-Meier plots for progression-free survival in the NCT00703326 trial. 
BN: Bayesian network; CART: classification and regression tree; CTGAN: conditional tabular generative adversarial network; RF: random forest.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e55118_fig03.png"/></fig><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Kaplan-Meier plots for overall survival in the NCT00460265 trial. BN: Bayesian network; CART: classification and regression tree; CTGAN: conditional tabular generative adversarial network; RF: random forest.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e55118_fig04.png"/></fig><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Box plot of progression-free survival hazard ratio distance (HRD) for each method and clinical trial. BN: Bayesian network; CART: classification and regression tree; CTGAN: conditional tabular generative adversarial network; RF: random forest.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e55118_fig05.png"/></fig><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Box plot of overall survival hazard ratio distance (HRD) for each method and clinical trial. BN: Bayesian network; CART: classification and regression tree; CTGAN: conditional tabular generative adversarial network; RF: random forest.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e55118_fig06.png"/></fig></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>Regarding the survival SPD, CART often yielded better results than the other methods in evaluations using MST, HRD, and visual analysis via KM plots. 
Given the crucial importance of the hazard ratio and MST as end points in oncology trials [<xref ref-type="bibr" rid="ref26">26</xref>], demonstrating the utility of both of these evaluation metrics is essential. Therefore, using CART for generating survival SPD was suggested as a beneficial approach.</p><p>While both CART and RF generally yielded preferable results across all trials, they share the common characteristic of using tree models. RF, with its use of the bootstrap method for resampling and constructing tree models for ensemble learning, is known to prevent overfitting. In general, in terms of constructing machine learning models with high generalization performance, RF performs better than CART. By contrast, CART is prone to overfitting as the layers of the tree become deeper [<xref ref-type="bibr" rid="ref32">32</xref>]. Although RF is considered a superior method for constructing high&#x2013;generalization-performance machine learning models, the results from <xref ref-type="table" rid="table2">Table 2</xref> and the KM plots in this study suggest that CART is a better approach than RF. This discrepancy might be due to differing views on what is a higher performance between the machine learning prediction model and SPD. In the machine learning prediction model, it is important to prevent overfitting and reduce bias; however, SPD is expected to match its statistical properties with actual data. Thus, in the case of SPD, the overfitting suppression mechanism possessed by RF might have resulted in inferiority to that of CART from the perspective of improving similarity.</p><p>In the case of using BN, the percentage of MSTSs falling within the 95% CI of MSTAs was 0% for the PFS of the NCT00339183 trial, and for OS, this phenomenon also occurred in the NCT00119613, NCT00339183, and NCT00460265 trials. This implies that the SPD failed to accurately reflect the statistical properties of the actual data. 
Conversely, a high value of 97.6% was observed for the PFS in the NCT00703326 trial. The reason for this discrepancy could not be determined on the basis of the results of this study. Tucker et al [<xref ref-type="bibr" rid="ref24">24</xref>] reported that they could generate data highly similar to actual data when using BN for the generation of SPD, which differs from our findings. One notable difference is that while Tucker et al [<xref ref-type="bibr" rid="ref24">24</xref>] used a large-scale actual data set of 27.5 million patients for their study, this study used only a few hundred patients for training data. This difference likely had a significant impact on the accuracy of the SPD generation model, resulting in conflicting results. However, the SPD generated by BN were not distributed in the direction of shortening PFS or OS. Thus, this would not be harmful when the SPD generated by BN is used as a more conservative control arm in clinical trials.</p><p>Using CTGAN, the percentage of the MSTSs falling within the 95% CI of the actual data was low, indicating low performance associated with the generation of SPD that reflect the statistical properties of the actual data. However, Krenmayr et al [<xref ref-type="bibr" rid="ref23">23</xref>] reported favorable performance results when using the same generative adversarial network (GAN)&#x2013;based methods and RWD. The differences between their study and our study were as follows: their study did not include SPD on survival time or generate multiple SPD data sets from the same actual data, and there was a large amount of individual patient data in their study. 
In particular, focusing on the amount of individual patient data, the number of patients in each trial included in this study was relatively small, with the NCT00119613 trial having 232 patients, the NCT00339183 trial having 476 patients, the NCT00460265 trial having 260 patients, and the NCT00703326 trial having 382 patients, while the trial conducted by Krenmayr et al [<xref ref-type="bibr" rid="ref23">23</xref>] had 500 or more patients. GAN-based methods using deep neural networks are known to perform poorly with small amounts of data [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. In this study, the NCT00339183 trial had the largest number of individual patient data, and the best case of CTGAN for NCT00339183 produced a KM plot similar to the actual data, suggesting that a larger data set yields better results. Thus, there is no contradiction. Another characteristic of using CTGAN in this study was the larger variance in the estimated MSTSs, as indicated in <xref ref-type="fig" rid="figure1">Figures 1</xref> and <xref ref-type="fig" rid="figure2">2</xref>. Goncalves et al [<xref ref-type="bibr" rid="ref34">34</xref>] showed that using MC-MedGAN, a GAN-based method, to generate an SPD from small data resulted in a large SD of the data utility metrics, leading to results with larger variance, similar to those of this study. Therefore, it is extremely challenging to generate useful SPD by applying GAN-based methods to small data sets, such as clinical trial data.</p><p>When generating SPDs for survival data and using them as a certain arm in a clinical trial, it is important to verify that the statistical properties closely match those of the actual data with the MST and the hazard ratio with the actual data being close to 1. 
Based on our results, we conclude that CART, which can concentrate the MSTSs within the range of 95% CI of MSTAs and approximately 0.9 for HRD, is an efficient method for generating SPD that meets the abovementioned conditions. However, even when using CART, slight variations were observed in the MSTSs, and some cases fell outside the 95% CI of the MSTAs, as revealed by our results. Therefore, for practical use, it is necessary to verify that the MSTSs are included in the 95% CI of the MSTAs and that both are close in value. It is also necessary to verify whether the HRD of the actual data and the SPD are close to 1 and then decide whether to adopt the generated SPD. Hence, the generation process must be repeated until an acceptable SPD is obtained. There may also be a need to use statistical methods to match characteristics between the SPD and the actual treatment arm in clinical trials.</p><p>In this study, even the most useful CART method produced SPDs that did not meet the requirements of MST and HRD. We expect that this issue will be addressed by incorporating feature engineering, such as dimension reduction, imputing missing values, derived variable creation, and other processing. Additionally, in clinical research, as subgroup analyses are frequently conducted, it is necessary to improve the generation method to reflect the statistical properties of the actual data even when the data are divided into subgroups under certain conditions. Moreover, from the perspective of data privacy, it is essential to incorporate approaches to prevent data reidentification into the generation method [<xref ref-type="bibr" rid="ref35">35</xref>].</p><p>In conclusion, as a method for generating SPD for survival data from small data sets, such as clinical trial data, CART is the most effective method for generating SPD that meet the 2 conditions of having an MSTS close to the MSTA and an HRD close to 1. 
However, as SPD that do not meet these 2 conditions might still be generated, it is necessary to incorporate mechanisms to improve a CART-based generation method in future studies. Overcoming these challenges would make it possible to reduce the recruitment period and costs of clinical trial participants to &#x2265;50% in comparative trials of new drug development against existing therapeutic drugs. This approach could accelerate clinical development, similar to the use of RWD.</p></sec></body><back><ack><p>We would like to express our gratitude to Project Data Sphere, the platform that provided the necessary data for this study, and to the clinical trial data providers Amgen and Eli Lilly.</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BN</term><def><p>Bayesian network</p></def></def-item><def-item><term id="abb2">CART</term><def><p>classification and regression trees</p></def></def-item><def-item><term id="abb3">CTGAN</term><def><p>conditional tabular generative adversarial network</p></def></def-item><def-item><term id="abb4">GAN</term><def><p>generative adversarial network</p></def></def-item><def-item><term id="abb5">HR</term><def><p>hazard ratio</p></def></def-item><def-item><term id="abb6">HRD</term><def><p>hazard ratio distance</p></def></def-item><def-item><term id="abb7">KM</term><def><p>Kaplan-Meier</p></def></def-item><def-item><term id="abb8">MST</term><def><p>median survival time</p></def></def-item><def-item><term id="abb9">MSTA</term><def><p>median survival time of actual data</p></def></def-item><def-item><term id="abb10">MSTS</term><def><p>median survival time of synthetic data</p></def></def-item><def-item><term id="abb11">OS</term><def><p>overall survival</p></def></def-item><def-item><term id="abb12">PFS</term><def><p>progression-free survival</p></def></def-item><def-item><term id="abb13">RF</term><def><p>random 
forest</p></def></def-item><def-item><term id="abb14">RWD</term><def><p>real-world data</p></def></def-item><def-item><term id="abb15">SPD</term><def><p>synthetic patient data</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>GD</given-names> </name><name name-style="western"><surname>Bull</surname><given-names>J</given-names> </name><name name-style="western"><surname>Johnston McKee</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Clinical trials recruitment planning: a proposed framework from the clinical trials transformation initiative</article-title><source>Contemp Clin Trials</source><year>2018</year><month>03</month><volume>66</volume><fpage>74</fpage><lpage>79</lpage><pub-id pub-id-type="doi">10.1016/j.cct.2018.01.003</pub-id><pub-id pub-id-type="medline">29330082</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fogel</surname><given-names>DB</given-names> </name></person-group><article-title>Factors associated with clinical trials that fail and opportunities for improving the likelihood of success: a review</article-title><source>Contemp Clin Trials Commun</source><year>2018</year><month>09</month><volume>11</volume><fpage>156</fpage><lpage>164</lpage><pub-id pub-id-type="doi">10.1016/j.conctc.2018.08.001</pub-id><pub-id pub-id-type="medline">30112460</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Treweek</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lockhart</surname><given-names>P</given-names> </name><name 
name-style="western"><surname>Pitkethly</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Methods to improve recruitment to randomised controlled trials: Cochrane systematic review and meta-analysis</article-title><source>BMJ Open</source><year>2013</year><volume>3</volume><issue>2</issue><fpage>e002360</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2012-002360</pub-id><pub-id pub-id-type="medline">23396504</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="web"><article-title>Considerations for the design and conduct of externally controlled trials for drug and biological products. Guidance for industry</article-title><source>US Food and Drug Administration</source><year>2023</year><access-date>2024-06-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.fda.gov/media/164960/download">https://www.fda.gov/media/164960/download</ext-link></comment></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yap</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Jacobs</surname><given-names>I</given-names> </name><name name-style="western"><surname>Baumfeld Andre</surname><given-names>E</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>LJ</given-names> </name><name name-style="western"><surname>Beaupre</surname><given-names>D</given-names> </name><name name-style="western"><surname>Azoulay</surname><given-names>L</given-names> </name></person-group><article-title>Application of real-world data to external control groups in oncology clinical trial drug development</article-title><source>Front Oncol</source><year>2021</year><volume>11</volume><fpage>695936</fpage><pub-id pub-id-type="doi">10.3389/fonc.2021.695936</pub-id><pub-id pub-id-type="medline">35070951</pub-id></nlm-citation></ref><ref 
id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dagenais</surname><given-names>S</given-names> </name><name name-style="western"><surname>Russo</surname><given-names>L</given-names> </name><name name-style="western"><surname>Madsen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Webster</surname><given-names>J</given-names> </name><name name-style="western"><surname>Becnel</surname><given-names>L</given-names> </name></person-group><article-title>Use of real&#x2010;world evidence to drive drug development strategy and inform clinical trial design</article-title><source>Clin Pharmacol Ther</source><year>2022</year><month>01</month><volume>111</volume><issue>1</issue><fpage>77</fpage><lpage>89</lpage><pub-id pub-id-type="doi">10.1002/cpt.2480</pub-id><pub-id pub-id-type="medline">34839524</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>R</given-names> </name><name name-style="western"><surname>Rizzo</surname><given-names>S</given-names> </name><name name-style="western"><surname>Whipple</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Evaluating eligibility criteria of oncology trials using real-world data and AI</article-title><source>Nature</source><year>2021</year><month>04</month><volume>592</volume><issue>7855</issue><fpage>629</fpage><lpage>633</lpage><pub-id pub-id-type="doi">10.1038/s41586-021-03430-5</pub-id><pub-id pub-id-type="medline">33828294</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Azizi</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Lindner</surname><given-names>S</given-names> 
</name><name name-style="western"><surname>Shiba</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>A comparison of synthetic data generation and federated analysis for enabling international evaluations of cardiovascular health</article-title><source>Sci Rep</source><year>2023</year><month>07</month><day>17</day><volume>13</volume><issue>1</issue><fpage>11540</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-38457-3</pub-id><pub-id pub-id-type="medline">37460705</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>El Emam</surname><given-names>K</given-names> </name><name name-style="western"><surname>Jonker</surname><given-names>E</given-names> </name><name name-style="western"><surname>Arbuckle</surname><given-names>L</given-names> </name><name name-style="western"><surname>Malin</surname><given-names>B</given-names> </name></person-group><article-title>A systematic review of re-identification attacks on health data</article-title><source>PLoS One</source><year>2011</year><volume>6</volume><issue>12</issue><fpage>e28071</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0028071</pub-id><pub-id pub-id-type="medline">22164229</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kaur</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sobiesk</surname><given-names>M</given-names> </name><name name-style="western"><surname>Patil</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Application of Bayesian networks to generate synthetic health data</article-title><source>J Am Med Inform Assoc</source><year>2021</year><month>03</month><day>18</day><volume>28</volume><issue>4</issue><fpage>801</fpage><lpage>811</lpage><pub-id 
pub-id-type="doi">10.1093/jamia/ocaa303</pub-id><pub-id pub-id-type="medline">33367620</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mavrogenis</surname><given-names>AF</given-names> </name><name name-style="western"><surname>Scarlat</surname><given-names>MM</given-names> </name></person-group><article-title>Artificial intelligence publications: synthetic data, patients, and papers</article-title><source>Int Orthop</source><year>2023</year><month>06</month><volume>47</volume><issue>6</issue><fpage>1395</fpage><lpage>1396</lpage><pub-id pub-id-type="doi">10.1007/s00264-023-05830-w</pub-id><pub-id pub-id-type="medline">37162553</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Meeker</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kallem</surname><given-names>C</given-names> </name><name name-style="western"><surname>Heras</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Garcia</surname><given-names>S</given-names> </name><name name-style="western"><surname>Thompson</surname><given-names>C</given-names> </name></person-group><article-title>Case report: evaluation of an open-source synthetic data platform for simulation studies</article-title><source>JAMIA Open</source><year>2022</year><month>10</month><volume>5</volume><issue>3</issue><fpage>ooac067</fpage><pub-id pub-id-type="doi">10.1093/jamiaopen/ooac067</pub-id><pub-id pub-id-type="medline">35958672</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brownstein</surname><given-names>JS</given-names> </name><name 
name-style="western"><surname>Chu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Marathe</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Combining participatory influenza surveillance with modeling and forecasting: three alternative approaches</article-title><source>JMIR Public Health Surveill</source><year>2017</year><month>11</month><day>1</day><volume>3</volume><issue>4</issue><fpage>e83</fpage><pub-id pub-id-type="doi">10.2196/publichealth.7344</pub-id><pub-id pub-id-type="medline">29092812</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guillaudeux</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rousseau</surname><given-names>O</given-names> </name><name name-style="western"><surname>Petot</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Patient-centric synthetic data generation, no reason to risk re-identification in biomedical data analysis</article-title><source>NPJ Digit Med</source><year>2023</year><month>03</month><day>10</day><volume>6</volume><issue>1</issue><fpage>37</fpage><pub-id pub-id-type="doi">10.1038/s41746-023-00771-5</pub-id><pub-id pub-id-type="medline">36899082</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>El Emam</surname><given-names>K</given-names> </name></person-group><article-title>Status of synthetic data generation for structured health data</article-title><source>JCO Clin Cancer Inform</source><year>2023</year><month>06</month><volume>7</volume><fpage>e2300071</fpage><pub-id pub-id-type="doi">10.1200/CCI.23.00071</pub-id><pub-id pub-id-type="medline">37390378</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>D&#x2019;Amico</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dall&#x2019;Olio</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sala</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Synthetic data generation by artificial intelligence to accelerate research and precision medicine in hematology</article-title><source>JCO Clin Cancer Inform</source><year>2023</year><month>06</month><volume>7</volume><fpage>e2300021</fpage><pub-id pub-id-type="doi">10.1200/CCI.23.00021</pub-id><pub-id pub-id-type="medline">37390377</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gonzales</surname><given-names>A</given-names> </name><name name-style="western"><surname>Guruswamy</surname><given-names>G</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>SR</given-names> </name></person-group><article-title>Synthetic data in health care: a narrative review</article-title><source>PLOS Digit Health</source><year>2023</year><month>01</month><volume>2</volume><issue>1</issue><fpage>e0000082</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000082</pub-id><pub-id pub-id-type="medline">36812604</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Giuffr&#x00E8;</surname><given-names>M</given-names> </name><name name-style="western"><surname>Shung</surname><given-names>DL</given-names> </name></person-group><article-title>Harnessing the power of synthetic data in healthcare: innovation, application, and privacy</article-title><source>NPJ Digit 
Med</source><year>2023</year><month>10</month><day>9</day><volume>6</volume><issue>1</issue><fpage>186</fpage><pub-id pub-id-type="doi">10.1038/s41746-023-00927-3</pub-id><pub-id pub-id-type="medline">37813960</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ursin</surname><given-names>G</given-names> </name><name name-style="western"><surname>Sen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mottu</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Nyg&#x00E5;rd</surname><given-names>M</given-names> </name></person-group><article-title>Protecting privacy in large datasets&#x2014;first we assess the risk; then we fuzzy the data</article-title><source>Cancer Epidemiol Biomarkers Prev</source><year>2017</year><month>08</month><day>1</day><volume>26</volume><issue>8</issue><fpage>1219</fpage><lpage>1224</lpage><pub-id pub-id-type="doi">10.1158/1055-9965.EPI-17-0172</pub-id><pub-id pub-id-type="medline">28754793</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rankin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Black</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bond</surname><given-names>R</given-names> </name><name name-style="western"><surname>Wallace</surname><given-names>J</given-names> </name><name name-style="western"><surname>Mulvenna</surname><given-names>M</given-names> </name><name name-style="western"><surname>Epelde</surname><given-names>G</given-names> </name></person-group><article-title>Reliability of supervised machine learning using synthetic data in health care: model to preserve privacy for data sharing</article-title><source>JMIR Med 
Inform</source><year>2020</year><month>07</month><day>20</day><volume>8</volume><issue>7</issue><fpage>e18910</fpage><pub-id pub-id-type="doi">10.2196/18910</pub-id><pub-id pub-id-type="medline">32501278</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Summers</surname><given-names>C</given-names> </name><name name-style="western"><surname>Griffiths</surname><given-names>F</given-names> </name><name name-style="western"><surname>Cave</surname><given-names>J</given-names> </name><name name-style="western"><surname>Panesar</surname><given-names>A</given-names> </name></person-group><article-title>Understanding the security and privacy concerns about the use of identifiable health data in the context of the COVID-19 pandemic: survey study of public attitudes toward COVID-19 and data-sharing</article-title><source>JMIR Form Res</source><year>2022</year><month>07</month><day>7</day><volume>6</volume><issue>7</issue><fpage>e29337</fpage><pub-id pub-id-type="doi">10.2196/29337</pub-id><pub-id pub-id-type="medline">35609306</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Azizi</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Mosquera</surname><given-names>L</given-names> </name><name name-style="western"><surname>Pilote</surname><given-names>L</given-names> </name><name name-style="western"><surname>El Emam</surname><given-names>K</given-names> </name><collab>GOING-FWD Collaborators</collab></person-group><article-title>Can synthetic data be a proxy for real clinical trial data? 
A validation study</article-title><source>BMJ Open</source><year>2021</year><month>04</month><day>16</day><volume>11</volume><issue>4</issue><fpage>e043497</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2020-043497</pub-id><pub-id pub-id-type="medline">33863713</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Krenmayr</surname><given-names>L</given-names> </name><name name-style="western"><surname>Frank</surname><given-names>R</given-names> </name><name name-style="western"><surname>Drobig</surname><given-names>C</given-names> </name><etal/></person-group><article-title>GANerAid: realistic synthetic patient data for clinical trials</article-title><source>Inform Med Unlocked</source><year>2022</year><volume>35</volume><fpage>101118</fpage><pub-id pub-id-type="doi">10.1016/j.imu.2022.101118</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tucker</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Rotalinti</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Myles</surname><given-names>P</given-names> </name></person-group><article-title>Generating high-fidelity synthetic patient data for assessing machine learning healthcare software</article-title><source>NPJ Digit Med</source><year>2020</year><month>11</month><day>9</day><volume>3</volume><issue>1</issue><fpage>147</fpage><pub-id pub-id-type="doi">10.1038/s41746-020-00353-9</pub-id><pub-id pub-id-type="medline">33299100</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name 
name-style="western"><surname>Santos</surname><given-names>M</given-names> </name></person-group><article-title>How to generate real-world synthetic data with CTGAN</article-title><source>Medium</source><year>2023</year><access-date>2024-06-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://medium.com/towards-data-science/how-to-generate-real-world-synthetic-data-with-ctgan-af41b4d60fde">https://medium.com/towards-data-science/how-to-generate-real-world-synthetic-data-with-ctgan-af41b4d60fde</ext-link></comment></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ben-Aharon</surname><given-names>O</given-names> </name><name name-style="western"><surname>Magnezi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Leshno</surname><given-names>M</given-names> </name><name name-style="western"><surname>Goldstein</surname><given-names>DA</given-names> </name></person-group><article-title>Median survival or mean survival: which measure is the most appropriate for patients, physicians, and policymakers?</article-title><source>Oncologist</source><year>2019</year><month>11</month><volume>24</volume><issue>11</issue><fpage>1469</fpage><lpage>1478</lpage><pub-id pub-id-type="doi">10.1634/theoncologist.2019-0175</pub-id><pub-id pub-id-type="medline">31320502</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Smith</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lambert</surname><given-names>PC</given-names> </name><name name-style="western"><surname>Rutherford</surname><given-names>MJ</given-names> </name></person-group><article-title>Generating high-fidelity synthetic time-to-event datasets to improve data transparency and 
accessibility</article-title><source>BMC Med Res Methodol</source><year>2022</year><month>06</month><day>23</day><volume>22</volume><issue>1</issue><fpage>176</fpage><pub-id pub-id-type="doi">10.1186/s12874-022-01654-1</pub-id><pub-id pub-id-type="medline">35739465</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="book"><person-group person-group-type="editor"><name name-style="western"><surname>Breiman</surname><given-names>L</given-names> </name></person-group><source>Classification and Regression Trees</source><year>1998</year><publisher-name>Chapman and Hall</publisher-name></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Breiman</surname><given-names>L</given-names> </name></person-group><article-title>Random forests</article-title><source>Mach Learn</source><year>2001</year><volume>45</volume><issue>1</issue><fpage>5</fpage><lpage>32</lpage><pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Pearl</surname><given-names>J</given-names> </name></person-group><article-title>Bayesian networks: a model of self-activated memory for evidential reasoning</article-title><access-date>2024-06-04</access-date><conf-name>Proceedings of the 7th Conference of the Cognitive Science Society</conf-name><conf-date>Aug 15 to 17, 1985</conf-date><conf-loc>Irvine, CA</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://ftp.cs.ucla.edu/tech-report/198_-reports/850017.pdf">https://ftp.cs.ucla.edu/tech-report/198_-reports/850017.pdf</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Xu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Skoularidou</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cuesta-Infante</surname><given-names>A</given-names> </name><name name-style="western"><surname>Veeramachaneni</surname><given-names>K</given-names> </name></person-group><article-title>Modeling tabular data using conditional GAN</article-title><source>arXiv</source><comment>Preprint posted online on Jul 1, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1907.00503</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hayes</surname><given-names>T</given-names> </name><name name-style="western"><surname>Usami</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jacobucci</surname><given-names>R</given-names> </name><name name-style="western"><surname>McArdle</surname><given-names>JJ</given-names> </name></person-group><article-title>Using classification and regression trees (CART) and random forests to analyze attrition: results from two simulations</article-title><source>Psychol Aging</source><year>2015</year><month>12</month><volume>30</volume><issue>4</issue><fpage>911</fpage><lpage>929</lpage><pub-id pub-id-type="doi">10.1037/pag0000046</pub-id><pub-id pub-id-type="medline">26389526</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Salimans</surname><given-names>T</given-names> </name><name name-style="western"><surname>Goodfellow</surname><given-names>I</given-names> </name><name name-style="western"><surname>Zaremba</surname><given-names>W</given-names> </name><name name-style="western"><surname>Cheung</surname><given-names>V</given-names> </name><name 
name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name></person-group><article-title>Improved techniques for training GANs</article-title><source>arXiv</source><access-date>2024-06-04</access-date><comment>Preprint posted online on Jun 10, 2016</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1606.03498">http://arxiv.org/abs/1606.03498</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.1606.03498</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goncalves</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ray</surname><given-names>P</given-names> </name><name name-style="western"><surname>Soper</surname><given-names>B</given-names> </name><name name-style="western"><surname>Stevens</surname><given-names>J</given-names> </name><name name-style="western"><surname>Coyle</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sales</surname><given-names>AP</given-names> </name></person-group><article-title>Generation and evaluation of synthetic patient data</article-title><source>BMC Med Res Methodol</source><year>2020</year><month>05</month><day>7</day><volume>20</volume><issue>1</issue><fpage>108</fpage><pub-id pub-id-type="doi">10.1186/s12874-020-00977-1</pub-id><pub-id pub-id-type="medline">32381039</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>El Emam</surname><given-names>K</given-names> </name><name name-style="western"><surname>Mosquera</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hoptroff</surname><given-names>R</given-names> </name></person-group><source>Practical 
Synthetic Data Generation: Balancing Privacy and the Broad Availability of Data</source><year>2020</year><publisher-name>O&#x2019;Reilly Media</publisher-name></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Variables used to generate synthetic patient data from the NCT00119613 trial.</p><media xlink:href="medinform_v12i1e55118_app1.docx" xlink:title="DOCX File, 48 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Variables used to generate synthetic patient data from the NCT00339183 trial.</p><media xlink:href="medinform_v12i1e55118_app2.docx" xlink:title="DOCX File, 48 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Variables used to generate synthetic patient data from the NCT00460265 trial.</p><media xlink:href="medinform_v12i1e55118_app3.docx" xlink:title="DOCX File, 47 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Variables used to generate synthetic patient data from the NCT00703326 trial.</p><media xlink:href="medinform_v12i1e55118_app4.docx" xlink:title="DOCX File, 48 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Histogram of the median survival times of the synthetic data for progression-free survival in the NCT00119613 trial. The dashed vertical line represents the median survival time of the actual data, and the light blue background indicates its 95% CI.</p><media xlink:href="medinform_v12i1e55118_app5.docx" xlink:title="DOCX File, 202 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Histogram of the median survival times of the synthetic data for progression-free survival in the NCT00339183 trial. 
The dashed vertical line represents the median survival time of the actual data, and the light blue background indicates its 95% CI.</p><media xlink:href="medinform_v12i1e55118_app6.docx" xlink:title="DOCX File, 192 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Histogram of the median survival times of the synthetic data for progression-free survival in the NCT00460265 trial. The dashed vertical line represents the median survival time of the actual data, and the light blue background indicates its 95% CI.</p><media xlink:href="medinform_v12i1e55118_app7.docx" xlink:title="DOCX File, 187 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>Histogram of the median survival times of the synthetic data for overall survival in the NCT00119613 trial. The dashed vertical line represents the median survival time of the actual data, and the light blue background indicates its 95% CI.</p><media xlink:href="medinform_v12i1e55118_app8.docx" xlink:title="DOCX File, 186 KB"/></supplementary-material><supplementary-material id="app9"><label>Multimedia Appendix 9</label><p>Histogram of the median survival times of the synthetic data for overall survival in the NCT00339183 trial. The dashed vertical line represents the median survival time of the actual data, and the light blue background indicates its 95% CI.</p><media xlink:href="medinform_v12i1e55118_app9.docx" xlink:title="DOCX File, 195 KB"/></supplementary-material><supplementary-material id="app10"><label>Multimedia Appendix 10</label><p>Histogram of the median survival times of the synthetic data for overall survival in the NCT00703326 trial. 
The dashed vertical line represents the median survival time of the actual data, and the light blue background indicates its 95% CI.</p><media xlink:href="medinform_v12i1e55118_app10.docx" xlink:title="DOCX File, 188 KB"/></supplementary-material><supplementary-material id="app11"><label>Multimedia Appendix 11</label><p>Kaplan-Meier plots for progression-free survival in the NCT00119613 trial.</p><media xlink:href="medinform_v12i1e55118_app11.docx" xlink:title="DOCX File, 215 KB"/></supplementary-material><supplementary-material id="app12"><label>Multimedia Appendix 12</label><p>Kaplan-Meier plots for progression-free survival in the NCT00339183 trial.</p><media xlink:href="medinform_v12i1e55118_app12.docx" xlink:title="DOCX File, 229 KB"/></supplementary-material><supplementary-material id="app13"><label>Multimedia Appendix 13</label><p>Kaplan-Meier plots for progression-free survival in the NCT00460265 trial.</p><media xlink:href="medinform_v12i1e55118_app13.docx" xlink:title="DOCX File, 218 KB"/></supplementary-material><supplementary-material id="app14"><label>Multimedia Appendix 14</label><p>Kaplan-Meier plots for overall survival in the NCT00119613 trial.</p><media xlink:href="medinform_v12i1e55118_app14.docx" xlink:title="DOCX File, 229 KB"/></supplementary-material><supplementary-material id="app15"><label>Multimedia Appendix 15</label><p>Kaplan-Meier plots for overall survival in the NCT00339183 trial.</p><media xlink:href="medinform_v12i1e55118_app15.docx" xlink:title="DOCX File, 252 KB"/></supplementary-material><supplementary-material id="app16"><label>Multimedia Appendix 16</label><p>Kaplan-Meier plots for overall survival in the NCT00703326 trial.</p><media xlink:href="medinform_v12i1e55118_app16.docx" xlink:title="DOCX File, 265 KB"/></supplementary-material></app-group></back></article>