<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="review-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v11i1e38266</article-id>
      <article-id pub-id-type="pmid">36649070</article-id>
      <article-id pub-id-type="doi">10.2196/38266</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Review</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Review</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Scalable Causal Structure Learning: Scoping Review of Traditional and Deep Learning Algorithms and New Opportunities in Biomedicine</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Banf</surname>
            <given-names>Michael</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Iyer</surname>
            <given-names>Ravi</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Upadhyaya</surname>
            <given-names>Pulakesh</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <address>
            <institution>Department of Biomedical Informatics</institution>
            <institution>Emory University School of Medicine</institution>
            <addr-line>101 Woodruff Circle</addr-line>
            <addr-line>Suite 4127</addr-line>
            <addr-line>Atlanta, GA, 30322</addr-line>
            <country>United States</country>
            <phone>1 9794225161</phone>
            <email>pulakeshupadhyaya@gmail.com</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1054-1380</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Kai</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4519-609X</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Can</given-names>
          </name>
          <degrees>MSPH</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2867-7833</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Jiang</surname>
            <given-names>Xiaoqian</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9933-2205</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Kim</surname>
            <given-names>Yejin</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7815-6310</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>School of Biomedical Informatics</institution>
        <institution>University of Texas Health Science Center at Houston</institution>
        <addr-line>Houston, TX</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Biomedical Informatics</institution>
        <institution>Emory University School of Medicine</institution>
        <addr-line>Atlanta, GA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Pulakesh Upadhyaya <email>pulakeshupadhyaya@gmail.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>17</day>
        <month>1</month>
        <year>2023</year>
      </pub-date>
      <volume>11</volume>
      <elocation-id>e38266</elocation-id>
      <history>
        <date date-type="received">
          <day>25</day>
          <month>3</month>
          <year>2022</year>
        </date>
        <date date-type="rev-request">
          <day>12</day>
          <month>6</month>
          <year>2022</year>
        </date>
        <date date-type="rev-recd">
          <day>30</day>
          <month>8</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>18</day>
          <month>9</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Pulakesh Upadhyaya, Kai Zhang, Can Li, Xiaoqian Jiang, Yejin Kim. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 17.01.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2023/1/e38266" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Causal structure learning refers to a process of identifying causal structures from observational data, and it can have multiple applications in biomedicine and health care.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This paper provides a practical review and tutorial on scalable causal structure learning models with examples of real-world data to help health care audiences understand and apply them.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We reviewed traditional (combinatorial and score-based) methods for causal structure discovery and machine learning–based schemes. Various traditional approaches have been studied to tackle this problem, the most important among these being the <italic>Peter</italic> Spirtes and <italic>Clark</italic> Glymour algorithms. This was followed by analyzing the literature on score-based methods, which are computationally faster. Owing to the continuous constraint on acyclicity, there are new deep learning approaches to the problem in addition to traditional and score-based methods. Such methods can also offer scalability, particularly when there is a large amount of data involving multiple variables. Using our own evaluation metrics and experiments on linear, nonlinear, and benchmark Sachs data, we aimed to highlight the various advantages and disadvantages associated with these methods for the health care community. We also highlighted recent developments in biomedicine where causal structure learning can be applied to discover structures such as gene networks, brain connectivity networks, and those in cancer epidemiology.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>We also compared the performance of traditional and machine learning–based algorithms for causal discovery over some benchmark data sets. Directed Acyclic Graph-Graph Neural Network has the lowest structural Hamming distance (19) and false positive rate (0.13) based on the Sachs data set, whereas Greedy Equivalence Search and Max-Min Hill Climbing have the best false discovery rate (0.68) and true positive rate (0.56), respectively.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Machine learning–based approaches, including deep learning, have many advantages over traditional approaches, such as scalability, including a greater number of variables, and potentially being applied in a wide range of biomedical applications, such as genetics, if sufficient data are available. Furthermore, these models are more flexible than traditional models and are poised to positively affect many applications in the future.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>causal inference</kwd>
        <kwd>causal structure discovery</kwd>
        <kwd>deep learning</kwd>
        <kwd>biomedicine</kwd>
        <kwd>networks</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Many applications in biomedicine require the knowledge of the underlying causal relationship between various factors beyond association or correlation. Randomized controlled trials are widely used to uncover causality, but these experiments can be prohibitively expensive or unethical in many cases. Therefore, this has sparked an enormous amount of interest in identifying causal effects from observational data [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>].</p>
        <p>In this paper, we discuss causal structure learning; that is, learning causal relationships that are represented as directed graph structures between different factors and its application to biomedicine. The causal structure is represented by a causal graph (also called a causal Bayesian network), which is a directed acyclic graph (DAG), in which the nodes represent variables and edges represent causation (<xref rid="figure1" ref-type="fig">Figure 1</xref>). An edge is drawn from a variable that represents the cause to a variable that represents the effect of that cause. Based on a variety of methodologies, causal structure learning identifies which causal models represented by DAGs accurately represent the observed data.</p>
        <p>For example, consider the example of a gene regulatory network [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref7">7</xref>], which is an abstract representation of the gene regulation processes as shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>. By observing the data of multiple variables such as gene expression profiles, causal structure learning attempts to discover causal relationships among the genes. For example, if a gene A regulates another gene B, it is represented by an arrow between gene A and gene B.</p>
        <p>Many researchers in the biomedical field are interested in causality and not just correlation (eg, whether a particular treatment affects a particular outcome). Unlike association- or correlation-based studies that simply indicate that any 2 variables are correlated, this approach seeks to determine the directional relationship between any 2 variables (eg, between a treatment variable and an outcome variable). In biomedicine, causal structure learning can be applied in a variety of applications.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Example of the causal structure. (A) A gene regulatory network is an abstracted structure (given by the directed graph on the right) of the complex biophysical process shown on the left. (B) A gene regulatory structure from the transmiR database for mice [<xref ref-type="bibr" rid="ref6">6</xref>].</p>
          </caption>
          <graphic xlink:href="medinform_v11i1e38266_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Examples</title>
        <sec>
          <title>Gene Regulatory Networks</title>
          <p>A gene regulatory network is a network in which molecular regulators and genes are the nodes, and the directed edges denote the interactions among them [<xref ref-type="bibr" rid="ref5">5</xref>]. This is in contrast to association-based methods such as finding correlation or mutual information among the genes (finding Pearson, Kendall, Spearman correlation coefficients, etc) that do not have any directional information [<xref ref-type="bibr" rid="ref8">8</xref>]. Such methods can only be accurate to a certain extent when it comes to deducing extensive gene regulatory structures from data sets with a large set of observations. Correlational studies can only indicate gene-gene association and not the direction of regulation. A gene regulatory network is an example of a causal structure that can be used to develop interventions to control gene expression.</p>
          <p>Causal structure learning algorithms have been used to jointly deduce the phenotype network structure and directional genetic architecture [<xref ref-type="bibr" rid="ref9">9</xref>]. It uses a difference causal inference method and compares it with another causal structure learning algorithm (difference-based Greedy Equivalence Search [GES]) as a baseline. Another study proposed a hybrid algorithm that combines Simulated Annealing with Greedy Algorithm to predict intergene transcriptional regulatory relationships [<xref ref-type="bibr" rid="ref10">10</xref>], which are also directional in nature. In cancer, somatic genome alterations and differentially expressed genes have causal relationships. A correlational study cannot provide directional information in any of these applications.</p>
          <p>The tumor-specific causal inference algorithm proposed by Xue et al [<xref ref-type="bibr" rid="ref11">11</xref>] uses a Bayesian causal learning framework to find those relationships. Unlike association-based studies, this study is based on a causal structure learning framework across the whole genome where Ha et al [<xref ref-type="bibr" rid="ref12">12</xref>] found gene signatures that were the causes of clinical outcomes and were not merely correlated to them. Apart from these examples, there are also networks such as those represented in the Sachs data set [<xref ref-type="bibr" rid="ref13">13</xref>] that simultaneously incorporates measurements of 11 phosphorylated proteins and phospholipids to find causal pathways linking them. This is different from association-based correlation studies because protein signaling pathways are directional.</p>
          <p>In our comparative analysis of the performance of this data set, we found that machine learning models can also be effective at finding causal structures (details are available in the <italic>Results</italic> section). In the case of more complicated protein signaling networks with many nodes, machine learning–based methods might be particularly effective.</p>
        </sec>
        <sec>
          <title>Brain Connectivity Networks</title>
          <p>Different regions of the brain have distinct functions. Previous studies have used correlation-based methods [<xref ref-type="bibr" rid="ref14">14</xref>] to find nondirectional functional connectivity among cortical regions. Spatial localization of brain functions has been studied using methods such as functional magnetic resonance imaging [<xref ref-type="bibr" rid="ref15">15</xref>]. Regions within the brain are the nodes, and a directed edge between regions represents some functional connection (see <xref rid="figure2" ref-type="fig">Figure 2</xref> in the paper by Brovelli et al [<xref ref-type="bibr" rid="ref16">16</xref>] for the difference between coherence and causality graphs). Such connections are directional, can have different strengths (weights), and can be both inhibitory or excitatory [<xref ref-type="bibr" rid="ref17">17</xref>]. Scalable causal structure learning models can also model such connection strengths in addition to directionality, which makes them more expressive than an association. In addition, brains have large-scale structural cortical networks that are directional with respect to information flow and can only be captured by causal structure instead of correlation.</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Overview of the methods reviewed and benchmark results sections of the paper. CGNN: Causal Generative Neural Networks; DAG-GNN: Directed Acyclic Graph-Graph Neural Network; FCI: Fast Causal Inference; GAE: graph autoencoder; GES: Greedy Equivalence Search; GRAN-DAG: Gradient-based neural-directed acyclic graph learning; IC: inductive causation; LiNGAM: linear non-Gaussian acyclic model; MMHC: Max-Min Hill Climbing; PC: Peter Spirtes and Clark Glymour; RL-BIC: Reinforcement Learning-Bayesian Information Criterion; SAM: Structural Agnostic Modeling.</p>
            </caption>
            <graphic xlink:href="medinform_v11i1e38266_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Epidemiology</title>
          <p>Causal structure learning has also been used in epidemiology with patients’ medical records. Many complex diseases are multifactorial in which a combination of these factors contributes to disease predisposition. Causal structure learning considers multiple confounders to determine causal effects solely from one factor of interest to another. For example, causal structure has been used to disentangle psychological factors that predispose adolescents to smartphone addiction [<xref ref-type="bibr" rid="ref18">18</xref>]. Incorporating a large set of medical claim records, a recent study used a scalable causal structure learning algorithm to elucidate the clinical pathways from comorbid illnesses to Alzheimer disease [<xref ref-type="bibr" rid="ref19">19</xref>].</p>
        </sec>
      </sec>
      <sec>
        <title>Challenges</title>
        <p>However, there are a few challenges. The general approach to solving this problem of learning a DAG from data, which has been studied for a long time [<xref ref-type="bibr" rid="ref20">20</xref>], has a time complexity that scales exponentially with the number of observed variables. This is because the problem is generally nondeterministic polynomial-time complete [<xref ref-type="bibr" rid="ref21">21</xref>]. In practice, if the number of variables is greater than a few hundred, the problem becomes intractable to solve optimally.</p>
        <p>Several approaches have been used to solve this problem of intractable time complexity. Traditionally, constraint-based and score-based methods, which search for the optimal graph from a discrete space of candidate graphs, have been used to learn the DAG from data. Constraint-based methods such as the <italic>Peter</italic> Spirtes and <italic>Clark</italic> Glymour (PC) and Fast Causal Inference (FCI) algorithms (which will be discussed in detail in the <italic>Discrete Space Algorithms</italic> section) rely on statistical tests to estimate the correct causal structure. However, biological data usually involve hundreds to thousands of variables, and the complexity of algorithms increases exponentially as the number of variables increases. For example, typical human RNA sequence data contain at least 20,000 genes. Therefore, the complexity of the PC algorithm is proportional to 2<sup>20000</sup>, which is infeasible within a reasonable amount of time.</p>
        <p>Hence, researchers have investigated various score-based methods that assign scores based on the data to each DAG and select the one with the best score. Although score-based methods scale better than constraint-based methods, they do not scale well for several thousand variables. On the other hand, patient medical records in electronic health records or claim data raise severe scalability concerns, because they include up to 144,000 International Classification of Diseases-Ninth Revision or 69,823 International Classification of Diseases-Tenth Revision diagnosis codes, &#62;2000 US Food and Drug Administration–approved drugs, and &#62;10,000 Current Procedural Terminology procedures or laboratory test codes.</p>
        <p>To overcome the limited scalability of traditional methods, recent advances in machine learning algorithms have relaxed the problem of finding an optimal DAG into a continuous optimization problem with smooth acyclicity constraints. This enables the use of nonheuristic machine learning (including deep learning) algorithms to determine the optimal causal structure. This is a promising development in the field of biomedicine. In this study, we focus on scalable algorithms. <xref ref-type="table" rid="table1">Table 1</xref> summarizes the algorithms discussed in this study. The tools available for some of these algorithms are listed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. A list of ground truth causal structures can be found in the <italic>bnlearn</italic> repository [<xref ref-type="bibr" rid="ref22">22</xref>].</p>
        <p>There are 2 distinct approaches in the context of treatment effect evaluation: the structural approach and potential outcome framework approach [<xref ref-type="bibr" rid="ref23">23</xref>]. In this study, we consider the first approach, in which there are 2 distinct types of algorithms for finding the causal DAG structure. In all of these examples, the goal is to learn a DAG that shows the directional relationship among variables from observational data.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Summary of various algorithms for causal structure learning.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="150"/>
            <col width="60"/>
            <col width="60"/>
            <col width="340"/>
            <col width="260"/>
            <col width="130"/>
            <thead>
              <tr valign="top">
                <td>Algorithm</td>
                <td>DS<sup>a</sup></td>
                <td>CS<sup>b</sup></td>
                <td>Summary</td>
                <td>Remarks</td>
                <td>Scalability</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>PC<sup>c</sup></td>
                <td>✓</td>
                <td>✖</td>
                <td>A partially directed acyclic graph (CPDAG<sup>d</sup>) is produced by iteratively checking the conditional independence conditions of adjacent nodes, conditioned on an all-size subset of neighbors.</td>
                <td>Outputs usually converge to the same equivalence class; high FPR<sup>e</sup> on experimental data</td>
                <td>+<sup>f</sup></td>
              </tr>
              <tr valign="top">
                <td>IC<sup>g</sup></td>
                <td>✓</td>
                <td>✖</td>
                <td>Returns the equivalent class of the DAG<sup>h</sup> based on the estimated probability distribution of random variables and an underlying DAG structure.</td>
                <td>Outputs usually converge to the same equivalence class.</td>
                <td>+</td>
              </tr>
              <tr valign="top">
                <td>FCI<sup>i</sup></td>
                <td>✓</td>
                <td>✖</td>
                <td>Modifies the PC algorithm to detect unknown confounding variables and produces asymptotically correct results.</td>
                <td>Faster than PC with similar TPR<sup>j</sup>; converges to the same asymptotic result; high experimental FPR</td>
                <td>++</td>
              </tr>
              <tr valign="top">
                <td>GES<sup>k</sup></td>
                <td>✓</td>
                <td>✖</td>
                <td>Starts with an empty graph and iteratively adds and deletes edges in the graph by optimizing a score function.</td>
                <td>Faster than PC with higher TPR; stable result for the same score function</td>
                <td>++</td>
              </tr>
              <tr valign="top">
                <td>Fast GES</td>
                <td>✓</td>
                <td>✖</td>
                <td>Improved and parallelized version of GES</td>
                <td>Faster than GES; same TPR; stable result for the same score function</td>
                <td>++</td>
              </tr>
              <tr valign="top">
                <td>K2</td>
                <td>✓</td>
                <td>✖</td>
                <td>Performs a greedy heuristic search for each node’s parents.</td>
                <td>Greedy searches might return very suboptimal solutions.</td>
                <td>++</td>
              </tr>
              <tr valign="top">
                <td>MMHC<sup>l</sup></td>
                <td>✓</td>
                <td>✖</td>
                <td>Uses MMHC to find the skeleton of the network and a constrained greedy search for edge orientation.</td>
                <td>Greedy searches might return suboptimal solutions.</td>
                <td>+</td>
              </tr>
              <tr valign="top">
                <td>LiNGAM<sup>m</sup></td>
                <td>✓</td>
                <td>✖</td>
                <td>Transfers the linear structure model <inline-graphic xlink:href="medinform_v11i1e38266_fig4.png" xlink:type="simple" mimetype="image"/> to the form of <inline-graphic xlink:href="medinform_v11i1e38266_fig5.png" xlink:type="simple" mimetype="image"/>, and optimizes for matrix B.</td>
                <td>Works very well on linear data but not on nonlinear data.</td>
                <td>++</td>
              </tr>
              <tr valign="top">
                <td>NOTEARS</td>
                <td>✖</td>
                <td>✓</td>
                <td>Uses smooth function <italic>h</italic>(<italic>A</italic>), whose value characterizes the “DAG-ness” of the graph with adjacency matrix A—that is, <italic>h</italic>(<italic>A</italic>)=0 for DAG—and optimizes using continuous optimization.</td>
                <td>Might converge to many different DAGs; GPUs<sup>n</sup> can speed up the process.</td>
                <td>+++</td>
              </tr>
              <tr valign="top">
                <td>NOBEARS</td>
                <td>✖</td>
                <td>✓</td>
                <td>Proposed a new acyclicity constraint that allows for faster optimization and scalability, and a polynomial regression loss to infer gene regulatory networks from nonlinear gene expressions.</td>
                <td>Might converge to many different DAGs; GPUs can speed up the process.</td>
                <td>+++</td>
              </tr>
              <tr valign="top">
                <td>DAG-GNN<sup>o</sup></td>
                <td>✖</td>
                <td>✓</td>
                <td>Uses an autoencoder framework and deep learning to train it and infer the causal structure from the weights of the trained network and is more scalable than NOTEARS.</td>
                <td>Might converge to many different DAGs; GPUs can speed up the process.</td>
                <td>++++</td>
              </tr>
              <tr valign="top">
                <td>NOFEARS</td>
                <td>✖</td>
                <td>✓</td>
                <td>Modifies NOTEARS so that the scoring function remains convex to ensure local minima.</td>
                <td>Might converge to many different DAGs; GPUs can speed up the process.</td>
                <td>++++</td>
              </tr>
              <tr valign="top">
                <td>GAE<sup>p</sup></td>
                <td>✖</td>
                <td>✓</td>
                <td>Scalable graph autoencoder framework (GAE) whose training time increases linearly with the number of variable nodes.</td>
                <td>Good accuracy; might converge to many different DAGs; GPUs can speed up the process.</td>
                <td>++++</td>
              </tr>
              <tr valign="top">
                <td>GRAN-DAG<sup>q</sup></td>
                <td>✖</td>
                <td>✓</td>
                <td>Extends the NOTEARS algorithm for nonlinear relationships.</td>
                <td>Works on nonlinear data; better accuracy than NOTEARS; might converge to many different DAGs; GPUs can speed up the process.</td>
                <td>++++</td>
              </tr>
              <tr valign="top">
                <td>CGNN<sup>r</sup></td>
                <td>✖</td>
                <td>✓</td>
                <td>Generative model of the joint distribution of variables reducing MMD<sup>s</sup> between the graph and data.</td>
                <td>Does not always converge to a single class of equivalent DAGs; GPUs can speed up the process.</td>
                <td>++++</td>
              </tr>
              <tr valign="top">
                <td>SAM<sup>t</sup></td>
                <td>✖</td>
                <td>✓</td>
                <td>Structurally agnostic model for causal discovery and penalized adversarial learning.</td>
                <td>Does not always converge to a single class of equivalent DAGs; GPUs can speed up the process.</td>
                <td>++++</td>
              </tr>
              <tr valign="top">
                <td>RL-BIC<sup>u</sup></td>
                <td>✖</td>
                <td>✓</td>
                <td>Reinforcement learning-based algorithm that uses both the acyclicity constraint and the BIC<sup>v</sup> score.</td>
                <td>Very good accuracy; does not always converge to a single class of equivalent DAGs; GPUs can speed up the process.</td>
                <td>++++</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>DS: discrete space algorithms.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>CS: continuous space algorithms.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>PC: <italic>Peter</italic> Spirtes and <italic>Clark</italic> Glymour.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>CPDAG: completed partially directed acyclic graph.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>FPR: false positive rate.</p>
            </fn>
            <fn id="table1fn6">
              <p><sup>f</sup>The + symbol for an algorithm indicates its scalability.</p>
            </fn>
            <fn id="table1fn7">
              <p><sup>g</sup>IC: inductive causation.</p>
            </fn>
            <fn id="table1fn8">
              <p><sup>h</sup>DAG: directed acyclic graph.</p>
            </fn>
            <fn id="table1fn9">
              <p><sup>i</sup>FCI: Fast Causal Inference.</p>
            </fn>
            <fn id="table1fn10">
              <p><sup>j</sup>TPR: true positive rate.</p>
            </fn>
            <fn id="table1fn11">
              <p><sup>k</sup>GES: Greedy Equivalence Search.</p>
            </fn>
            <fn id="table1fn12">
              <p><sup>l</sup>MMHC: Max-Min Hill Climbing.</p>
            </fn>
            <fn id="table1fn13">
              <p><sup>m</sup>LiNGAM: linear non-Gaussian acyclic model.</p>
            </fn>
            <fn id="table1fn14">
              <p><sup>n</sup>GPUs: graphical processing units.</p>
            </fn>
            <fn id="table1fn15">
              <p><sup>o</sup>DAG-GNN: Directed Acyclic Graph-Graph Neural Network.</p>
            </fn>
            <fn id="table1fn16">
              <p><sup>p</sup>GAE: graph autoencoder.</p>
            </fn>
            <fn id="table1fn17">
              <p><sup>q</sup>GRAN-DAG: gradient-based neural directed acyclic graph learning.</p>
            </fn>
            <fn id="table1fn18">
              <p><sup>r</sup>CGNN: causal generative neural network.</p>
            </fn>
            <fn id="table1fn19">
              <p><sup>s</sup>MMD: maximum mean discrepancy.</p>
            </fn>
            <fn id="table1fn20">
              <p><sup>t</sup>SAM: Structural Agnostic Modeling.</p>
            </fn>
            <fn id="table1fn21">
              <p><sup>u</sup>RL-BIC: Reinforcement Learning-Bayesian Information Criterion.</p>
            </fn>
            <fn id="table1fn22">
              <p><sup>v</sup>BIC: Bayesian information criterion.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Paper Structure</title>
        <p>This study attempts to provide the biomedicine community with a comparative study of various scalable algorithms that are used to discover causal structures from observational data. Some of these traditional and score-based methods have been extensively studied [<xref ref-type="bibr" rid="ref24">24</xref>], but many of the algorithms discussed here focus on scalable causal structure learning. Although we do not list all possible approaches as Vowels et al [<xref ref-type="bibr" rid="ref25">25</xref>] do, we sample a few important algorithms and evaluate their performance on synthetic data sets and the Sachs data set [<xref ref-type="bibr" rid="ref13">13</xref>].</p>
        <p>This tutorial paper presents algorithms for causal structure identification in biomedical informatics. In the <italic>Methods</italic> section, we discuss the methodology and examine the traditional algorithms that determine the optimal causal graph in a discrete space. We also discuss algorithms that use continuous space optimization to discover causal relationships. We compare the performance of these algorithms in the <italic>Results</italic> section. Finally, we present the discussion and conclusions. A brief overview of the methods and results is presented in <xref rid="figure2" ref-type="fig">Figure 2</xref>.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>In this section, we discuss 2 paradigms of algorithms for causal structure learning. First, we consider algorithms that search for the optimal DAG in the discrete space of all possible DAGs (space of all possible discrete DAGs for a given number of variable nodes) or <italic>discrete space algorithms</italic>. Second, we consider scalable algorithms that use continuous optimization methods to find the optimal DAG (ie, algorithms that search the continuous space of all possible weighted DAGs to find the optimal one), known as <italic>continuous space algorithms</italic>.</p>
      </sec>
      <sec>
        <title>Discrete Space Algorithms</title>
        <sec>
          <title>Overview</title>
          <p>The first type that we discuss in this section is discrete space algorithms for causal discovery; that is, algorithms that search for the optimal DAG in the discrete space of candidate causal graphs. This is in contrast to continuous space algorithms (discussed in the <italic>Continuous Space Algorithms</italic> section) that search for the optimal DAG from the continuous space of weighted candidate graphs.</p>
          <p>The discrete space algorithms can be divided into the following 4 types: combinatorial constraint-based models, score-based models, hybrid models, and functional models. In combinatorial constrained-based methods, we consider methods that check the conditional independence relations of 2 adjacent nodes conditioned on all subsets of their neighbors. Such methods can be useful when the number of variables is up to a few hundred. Score-based methods perform optimization by considering a score representing the goodness of fit and can handle more variables than constraint-based methods. Hybrid methods combine constraint- and score-based algorithms. Functional models find structural equations to describe the causal relationship and are useful mostly when the variables can be assumed to be expressed by some linear or nonlinear equations.</p>
        </sec>
        <sec>
          <title>Combinatorial Methods</title>
          <p>We now focus on combinatorial optimization methods, where conditional independence relationships in the data are used for finding the optimal DAG.</p>
          <sec>
            <title>PC and Its Variants</title>
            <p>The PC algorithm was proposed by Peter Spirtes and Clark Glymour and is named after their first names [<xref ref-type="bibr" rid="ref26">26</xref>]. This algorithm produces a completed partially directed DAG (CPDAG) by iteratively checking the conditional independence relations of 2 adjacent nodes conditioned on all possible subsets of their neighbors. Three assumptions underlie the algorithm: no confounder variable, the causal Markov condition, and faithfulness. Under these conditions, this algorithm generates a partially directed causal graph that is proven to be asymptotically correct.</p>
            <p>The PC algorithm is order-dependent; that is, the output of the algorithm can depend on the order in which the variables are provided to the algorithm. To address this problem, Colombo and Maathuis [<xref ref-type="bibr" rid="ref27">27</xref>] developed a PC-stable algorithm in which the deletion of an edge takes place at the end of each stage (considering any 2 nodes’ relations within a predetermined neighborhood). Thus, any ordering of vertices will result in the same edge deletions, resulting in the same stable output. The PCMCI and PCMCI+ [<xref ref-type="bibr" rid="ref27">27</xref>-<xref ref-type="bibr" rid="ref29">29</xref>] are 2 extensions of the PC algorithm proposed to handle large-scale time-series data sets.</p>
          </sec>
          <sec>
            <title>Inductive Causation Algorithm and Its Variants</title>
            <p>The inductive causation (IC) algorithm uses the estimated probability distribution of random variables with an underlying DAG structure and outputs the equivalent class of the DAG. In contrast, PC provides a schematic search method and is thus considered a refinement of the IC.</p>
            <p>The IC* algorithm [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>] is an extension of the IC algorithm, which searches for causal relations using observations of a set of variables, even when they appear as latent variables. The output of the IC algorithm is a CPDAG that only has directed edges (identified causation) and undirected edges (undetermined causation). The output of the IC* algorithm is an embedded pattern; that is, a hybrid graph containing ≥2 types of edges.</p>
          </sec>
          <sec>
            <title>FCI and Its Variants</title>
            <p>The FCI is a modification of the PC algorithm [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref32">32</xref>] that detects unknown confounding variables and produces asymptotically correct results. FCI improves the PC algorithm by adopting 2 rounds of phases of the PC algorithm. The algorithm first uses PC-phase I to find an initial skeleton, then uses the separation set to orient all v-structure triples (a-&#62;c&#60;-b) and outputs a CPDAG; then performs another round of skeleton searching based on the CPDAG and repeats the orientation for unshielded triples. The really fast causal inference algorithm [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>] skips the second step, which is the most time-consuming part of the task, and therefore significantly accelerates the FCI procedure. A set of 10 rules was added to the algorithm to orient the edges of the skeleton.</p>
          </sec>
        </sec>
        <sec>
          <title>Score-Based Methods</title>
          <sec>
            <title>Overview</title>
            <p>In addition to traditional combinatorial methods such as PC and FCI, score-based methods have also been used to uncover causal structures. In these methods, algorithms determine the optimal DAG by optimizing a particular score.</p>
            <p>A typical score function is the Bayesian information criterion (BIC) score. The GES algorithm uses different score functions for different data types as follows: the BIC score (for continuous data), likelihood-equivalence Bayesian Dirichlet uniform joint distribution score (for discrete data), and Conditional Gaussian score (for continuous or discrete mixture data).</p>
            <disp-formula>
              <graphic xlink:href="medinform_v11i1e38266_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
            <p>where <inline-graphic xlink:href="medinform_v11i1e38266_fig7.png" xlink:type="simple" mimetype="image"/>
 is the maximized likelihood function of the model, <italic>n</italic> is the number of observational data points, and <italic>k</italic> is the number of degrees of freedom. The definition of the Bayesian Dirichlet uniform joint distribution scoring function can be found in a study by Buntine [<xref ref-type="bibr" rid="ref35">35</xref>]. The Conditional Gaussian score is defined on the ratios of joint distributions, and Andrews et al [<xref ref-type="bibr" rid="ref36">36</xref>] have proved that the Conditional Gaussian score is <italic>score equivalent</italic>; that is, a scoring function that scores all DAGs in the same Markov Equivalence Class equally.</p>
            <p>Score-based methods include the GES algorithm, the fast GES algorithm, and the K2 algorithm.</p>
          </sec>
          <sec>
            <title>GES Algorithm</title>
            <p>The GES algorithm was proposed by Chickering [<xref ref-type="bibr" rid="ref37">37</xref>], and its underlying principles were obtained from Meek [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. The algorithm starts with an empty graph and iteratively adds and deletes the edges in the graph by optimizing a score function. During the forward phase, the algorithm searches iteratively from the space of the DAGs created by one edge addition on the current DAG and selects the edge with the best score. The forward phase ends when the score is no longer increasing. In the second phase, the algorithm repeats the above step but deletes one edge at a time and selects the edge that improves the score the most. The algorithm stops as soon as there are no more edges to be deleted.</p>
          </sec>
          <sec>
            <title>Fast GES Algorithm</title>
            <p>Fast GES is an improved and parallelized version of the GES. Significant speedup was achieved by storing the score information during the GES algorithm [<xref ref-type="bibr" rid="ref39">39</xref>]. In addition, several insights regarding parallelization were offered in the paper. First, the precalculation of covariances can be parallelized by variables. Second, it is possible to parallelize the process of calculating the edge scores when an edge addition is being performed on the graph. A greater speedup can be achieved for sparse graphs.</p>
          </sec>
          <sec>
            <title>K2 Algorithm</title>
            <p>The main idea of the K2 algorithm [<xref ref-type="bibr" rid="ref40">40</xref>] is to perform a greedy heuristic search of the parents of each node. For each node, the algorithm iteratively determines the parents. When visiting node <italic>X<sub>i</sub></italic>, the algorithm searches for all possible parents of <italic>X<sub>i</sub></italic> (<italic>X<sub>j</sub></italic> such that <italic>j</italic> has a lower ordering than <italic>i</italic>). The algorithm greedily adds <italic>X<sub>j</sub></italic> to the parent set of <italic>X<sub>i</sub></italic> if it could increase a predefined score function. The iteration for node <italic>X<sub>i</sub></italic> stops when the number of parent nodes reaches the (preset) maximum or when adding an <italic>X<sub>j</sub></italic> does not increase the score anymore. The entire algorithm finishes after completing the iteration for all <italic>X<sub>i</sub></italic>.</p>
          </sec>
        </sec>
        <sec>
          <title>Hybrid Algorithms</title>
          <p>Hybrid algorithms use a combination of score-based and combinatorial constraint-based optimization methods to determine the optimal DAG. An example is the Max-Min Hill Climbing (MMHC) algorithm. The MMHC algorithm is a combination of constraint- and score-based algorithms [<xref ref-type="bibr" rid="ref41">41</xref>]. It uses the Max-Min Parents and Children algorithm (a detailed description is provided in [<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>]) to find the skeleton of the Bayesian network and then performs a constrained greedy search to orient the edges.</p>
        </sec>
        <sec>
          <title>Algorithms for Functional Causal Models</title>
          <sec>
            <title>Overview</title>
            <p>Functional causal models or structural equation models (SEMs) assume structural equations that define the causal relationships. Such structural equations may describe the linear and nonlinear relationships among variables. In addition to the discrete methods discussed here, SEMs are also an important assumption in many machine learning–based methods that use the continuous optimization techniques in the <italic>Continuous Space Algorithms</italic> section.</p>
          </sec>
          <sec>
            <title>Linear Non-Gaussian Acyclic Model</title>
            <p>The linear non-Gaussian acyclic model (LiNGAM) was originally proposed by Shimizu [<xref ref-type="bibr" rid="ref43">43</xref>] to learn linear non-Gaussian acyclic causal graphs from continuous-valued data. The LiNGAM transfers <inline-graphic xlink:href="medinform_v11i1e38266_fig8.png" xlink:type="simple" mimetype="image"/> to the form of <inline-graphic xlink:href="medinform_v11i1e38266_fig9.png" xlink:type="simple" mimetype="image"/>, and the causal structure problem becomes an optimization problem for matrix <italic>B</italic>. There are several extensions of the LiNGAM model using different estimation methods, including independent component analysis–based LiNGAM [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>], DirectLiNGAM [<xref ref-type="bibr" rid="ref45">45</xref>], and Pairwise LiNGAM [<xref ref-type="bibr" rid="ref46">46</xref>].</p>
          </sec>
          <sec>
            <title>Additive Noise Models</title>
            <p>A nonlinear additive noise model is proposed in [<xref ref-type="bibr" rid="ref47">47</xref>]. The model assumes that the observed data are generated according to the following equation:</p>
            <disp-formula>
              <graphic xlink:href="medinform_v11i1e38266_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
            <p>
          where <italic>f<sub>i</sub></italic> is an arbitrary function, <italic>x<sub>pa(i)</sub></italic> denotes the ancestor nodes of node <italic>x<sub>i</sub></italic> in the true causal graph, and <italic>n<sub>i</sub></italic> is the noise variable of an arbitrary probability density function. This study proves the basic identifiability principle for the 2 variables case and generalizes the results to multiple variables.</p>
          </sec>
        </sec>
      </sec>
      <sec>
        <title>Continuous Space Algorithms</title>
        <sec>
          <title>Overview</title>
          <p>Traditional causal discovery algorithms attempt to discover a causal graph, which is usually a DAG, while searching for an optimal graph in the space of candidate graphs. The score-based optimization problem of DAG learning (discussed in the <italic>Score-Based Methods</italic> section) is mathematically given by the following equation:</p>
          <disp-formula>
            <graphic xlink:href="medinform_v11i1e38266_fig11.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>
           Here, <inline-graphic xlink:href="medinform_v11i1e38266_fig12.png" xlink:type="simple" mimetype="image"/> is the set of all DAGs with <italic>d</italic> nodes, and <inline-graphic xlink:href="medinform_v11i1e38266_fig13.png" xlink:type="simple" mimetype="image"/> is the cost or score function. The problem of searching for all DAGs is usually intractable and superexponential in the number of nodes in the graph.</p>
          <p>An alternative approach would be to model the problem as a continuous space optimization problem, which would then allow the application of various learning techniques. Recently, several publications have explored continuous optimization methods that learn DAGs by adding an acyclicity constraint. In these approaches, the discrete acyclicity constraint <inline-graphic xlink:href="medinform_v11i1e38266_fig14.png" xlink:type="simple" mimetype="image"/> is replaced by <inline-graphic xlink:href="medinform_v11i1e38266_fig15.png" xlink:type="simple" mimetype="image"/>, where <italic>h</italic>(<italic>A</italic>) is a smooth function that ensures acyclicity of <italic>G</italic>(<italic>A</italic>).</p>
          <p>The hard constraints on acyclicity can be relaxed and incorporated into the loss function to be optimized. This smooth continuous constraint allows the use of machine learning–based tools, which in turn can make the algorithms scalable in the presence of substantial amounts of data. These algorithms are based on SEMs.</p>
        </sec>
        <sec>
          <title>NOBEARS Algorithm</title>
          <p>Several other improvements such as the NOBEARS algorithm [<xref ref-type="bibr" rid="ref48">48</xref>] have improved the scalability of the NOTEARS algorithm. A fast approximation of a new constraint is proposed, and a polynomial regression loss model is proposed to account for nonlinearity in gene expression to infer gene regulatory networks.</p>
        </sec>
        <sec>
          <title>NOTEARS Algorithm</title>
          <p>This algorithm considers the acyclicity constraint and comes up with the following constraint:</p>
          <disp-formula>
            <graphic xlink:href="medinform_v11i1e38266_fig16.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>Here, <inline-graphic xlink:href="medinform_v11i1e38266_fig32.png" xlink:type="simple" mimetype="image"/>
 is the element-wise product. <italic>h</italic>(<italic>A</italic>) equals 0 if and only if <italic>G</italic>(<italic>A</italic>) is acyclic, and more severe deviations from acyclicity would increase the value of the function. This study assumes a linear SEM:</p>
          <disp-formula>
            <graphic xlink:href="medinform_v11i1e38266_fig17.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>Here, <italic>X<sub>i</sub></italic> is a <italic>d</italic>-dimensional sample vector of the joint distribution of <italic>d</italic> variables and <italic>Z<sub>i</sub></italic> is a <italic>d</italic>-dimensional noise vector. We denote <italic>n</italic> such samples by matrix <italic>X,</italic> and the loss function (with <italic>l</italic><sub>1</sub>-regularization) is given as follows:</p>
          <disp-formula>
            <graphic xlink:href="medinform_v11i1e38266_fig18.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>The constraint is given by <italic>h</italic>(<italic>A</italic>)=0 and is used in the final Lagrangian formulation of the loss function. The paper on learning sparse nonparametric DAGs is an extension of NOTEARS, which tries to define a “surrogate” of the matrix 𝐴 above for general nonparametric models to optimize [<xref ref-type="bibr" rid="ref49">49</xref>].</p>
        </sec>
        <sec>
          <title>Directed Acyclic Graph-Graph Neural Network Algorithm</title>
          <p>A Directed Acyclic Graph-Graph Neural Network (DAG-GNN) [<xref ref-type="bibr" rid="ref50">50</xref>] generalizes the NOTEARS algorithm by considering the nonlinearity in the SEMs. It can be modeled with a variational autoencoder neural network with a special structure, with an encoder<inline-graphic xlink:href="medinform_v11i1e38266_fig19.png" xlink:type="simple" mimetype="image"/>, and a decoder <inline-graphic xlink:href="medinform_v11i1e38266_fig20.png" xlink:type="simple" mimetype="image"/> and where <italic>g</italic><sub>1</sub>,<italic>g</italic><sub>2</sub> are parameterized functions that can be assumed to serve as the inverse of <italic>f</italic><sub>1</sub>,<italic>f</italic><sub>2</sub>, respectively.</p>
          <p>This variational framework considers <italic>Z</italic> to be a latent vector (instead of viewing it as noise in linear SEMs), which can have dimensions other than <italic>d</italic>. The decoder then attempts to reconstruct the data from this latent variable. The encoder and decoder can be trained together from <italic>n</italic> samples of <inline-graphic xlink:href="medinform_v11i1e38266_fig21.png" xlink:type="simple" mimetype="image"/> such that the loss function:</p>
          <disp-formula>
            <graphic xlink:href="medinform_v11i1e38266_fig22.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>is minimized, where <italic>KLD</italic> is the Kullback-Leibler divergence. The constraint in this optimization process to ensure the acyclicity of matrix <italic>A</italic> is slightly modified to:</p>
          <disp-formula>
            <graphic xlink:href="medinform_v11i1e38266_fig23.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>where <italic>α</italic> is an arbitrary parameter. This constraint can be implemented more easily in graphical processing unit-based deep learning libraries owing to the algorithm’s parallelizability and scalability.</p>
        </sec>
        <sec>
          <title>NOFEARS Algorithm</title>
          <p>Wei et al [<xref ref-type="bibr" rid="ref51">51</xref>] demonstrated that the NOTEARS algorithm fails to satisfy the Karush-Kuhn-Tucker regularity conditions. Therefore, they reformulated the problem to ensure that the convexity of the scoring function can still ensure local minima even when the constraints are nonconvex. This new algorithm called the NOFEARS algorithm has the following acyclicity constraint.</p>
          <disp-formula>
            <graphic xlink:href="medinform_v11i1e38266_fig24.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
        </sec>
        <sec>
          <title>Graph Autoencoder</title>
          <p>Ng et al [<xref ref-type="bibr" rid="ref52">52</xref>] proposed another graph autoencoder (GAE) framework for causal structure learning, which improves the training speed and performance over DAG-GNN for both linear and nonlinear synthetic data sets.</p>
          <p>Some other similar machine learning–based continuous learning algorithms include gradient-based neural DAG [<xref ref-type="bibr" rid="ref53">53</xref>], Causal Generative Neural Network [<xref ref-type="bibr" rid="ref54">54</xref>], and structurally agnostic model [<xref ref-type="bibr" rid="ref55">55</xref>].</p>
        </sec>
        <sec>
          <title>Reinforcement Learning-Based Methods</title>
          <p>Reinforcement learning-based methods have been proposed recently that consider both the acyclicity constraint and BIC score in the reward function and attempt to learn the DAG [<xref ref-type="bibr" rid="ref56">56</xref>]. They used an actor-critic model, where the actor is an encoder-decoder framework that takes data as input and outputs the graph. The critic uses the reward function for this graph and updates the proposed graph.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>This section provides the results to compare the effectiveness of some causal structure learning algorithms on synthetic and real data.</p>
      <sec>
        <title>Benchmark Methods</title>
        <p>The synthetic data were generated in the same manner as in the DAG-GNN paper [<xref ref-type="bibr" rid="ref50">50</xref>]. An Erdos-Renyi model with an expected node degree of 3 was used to generate the random graph, and the adjacency matrix was formed by assigning weights to the edges from a uniform distribution. The samples were generated using the following structural equation:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v11i1e38266_fig25.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>Here, <italic>Z</italic> is random Gaussian noise. We consider 2 functions for <italic>g</italic>(<italic>X</italic>). The first is a (linear) identity function:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v11i1e38266_fig26.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>and the second is a nonlinear function</p>
        <disp-formula>
          <graphic xlink:href="medinform_v11i1e38266_fig27.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>We considered 5 data sets for both linear and nonlinear functions. For each data set, we generated n=5000 independent samples according to the above equations. We used 6 algorithms, 4 of which are discrete space algorithms. PC and Greedy Fast Causal Inference (GFCI) are constraint-based methods, GES is a score-based method, and MMHC is a hybrid method. We also considered 2 continuous space methods, DAG-GNN and GAE.</p>
        <p>We also evaluated these algorithms on the publicly available Sachs data set [<xref ref-type="bibr" rid="ref13">13</xref>] using the above 4 metrics; the results are shown in <xref ref-type="table" rid="table2">Table 2</xref>. For other data sets that have the ground truth but are not covered in our experiments, please refer to the <italic>bnlearn</italic> repository [<xref ref-type="bibr" rid="ref22">22</xref>]. The algorithms were implemented in Python for machine learning–based continuous space methods and R for discrete space algorithms.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Benchmark experiments on the Sachs data set. We evaluated 6 algorithms—Peter Spirtes and Clark Glymour (PC), Greedy Equivalence Search (GES), Greedy Fast Causal Inference (GFCI), Max-Min Hill Climbing (MMHC), Directed Acyclic Graph-Graph Neural Network (DAG-GNN), and graph autoencoder (GAE)—on 4 metrics: structural Hamming distance (SHD), true positive rate (TPR), false positive rate (FPR), and false discovery rate (FDR). We show their results in <xref rid="figure3" ref-type="fig">Figure 3</xref>. In all these evaluations, we consider any edge whose direction is reversed as half discovered.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="170"/>
            <col width="130"/>
            <col width="130"/>
            <col width="130"/>
            <col width="140"/>
            <col width="180"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td>Metric</td>
                <td>PC</td>
                <td>GES</td>
                <td>GFCI</td>
                <td>MMHC</td>
                <td>DAG-GNN</td>
                <td>GAE</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>SHD (<italic>↓</italic>)</td>
                <td>24.50</td>
                <td>26.50</td>
                <td>29.50</td>
                <td>22.00</td>
                <td>
                  <italic>19.00</italic>
                  <sup>a</sup>
                </td>
                <td>22.00</td>
              </tr>
              <tr valign="top">
                <td>FDR (<italic>↓</italic>)</td>
                <td>0.77</td>
                <td>0.72</td>
                <td>0.79</td>
                <td>
                  <italic>0.68</italic>
                </td>
                <td>0.71</td>
                <td>0.89</td>
              </tr>
              <tr valign="top">
                <td>TPR (<italic>↑</italic>)</td>
                <td>0.32</td>
                <td>
                  <italic>0.56</italic>
                </td>
                <td>0.44</td>
                <td>0.47</td>
                <td>0.11</td>
                <td>0.05</td>
              </tr>
              <tr valign="top">
                <td>FPR (<italic>↓</italic>)</td>
                <td>0.49</td>
                <td>0.64</td>
                <td>0.72</td>
                <td>0.45</td>
                <td>
                  <italic>0.13</italic>
                </td>
                <td>0.21</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Italicized values represent the best results for each metric.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Accuracy comparison. We evaluated 6 algorithms: Peter Spirtes and Clark Glymour (PC), Greedy Equivalence Search (GES), Greedy Fast Causal Inference (GFCI), Max-Min Hill Climbing (MMHC), Directed Acyclic Graph-Graph Neural Network (DAG-GNN), and graph autoencoder (GAE) on 4 metrics: structural Hamming distance (SHD↓), true positive rate (TPR↑), false positive rate (FPR↓), and false discovery rate (FDR↓). In all these evaluations, we considered any bidirectional edges as half discovered. In experiments (A-D), first column, the data are drawn from a distribution according to the underlying causal graph where relationships between nodes are linear, and experiments (E-H), second column, are for the nonlinear case. In all experiments, the number of nodes of the graph ranges over 10, 20, 50, and 100. For each graph size, we drew 5 different data sets from the graph structure with a sample size of 1000, calculated the 4 evaluation metrics, and obtained the average.</p>
          </caption>
          <graphic xlink:href="medinform_v11i1e38266_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Observations</title>
        <p>All algorithms were tested on both linear and nonlinear data. The accuracies of some of these algorithms are shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>. We used the following 4 evaluation measures: structural hamming distance (SHD), true positive rate (TPR), false positive rate (FPR), and false discovery rate (FDR). SHD refers to the number of edge insertions, deletions, and reversals. In our case, we used a modified SHD, where a reversal contributes half of the SHD score instead of 1. The TPR is the ratio of the algorithm’s correctly discovered edges to the number of edges in the ground truth graph. FPR is the ratio of the algorithm’s falsely discovered edges to the number of nonedges in the ground truth graph. FDR is the ratio of the algorithm’s falsely discovered edges to the total number of discovered edges. We also evaluated these algorithms on the Sachs data set using the above 4 metrics, and the results are shown in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
      </sec>
      <sec>
        <title>Time Complexity</title>
        <p>The relative scalability of different algorithms is presented in <xref ref-type="table" rid="table1">Table 1</xref>. The worst-case time complexity of the PC, GES, and GFCI algorithms was <inline-graphic xlink:href="medinform_v11i1e38266_fig28.png" xlink:type="simple" mimetype="image"/>, where <italic>m</italic> is the number of variables (nodes in the DAG). For the GES, the best-case time complexity was <inline-graphic xlink:href="medinform_v11i1e38266_fig29.png" xlink:type="simple" mimetype="image"/>
. For the GAE and DAG-GNN, the time complexity of the algorithm is <inline-graphic xlink:href="medinform_v11i1e38266_fig30.png" xlink:type="simple" mimetype="image"/>, where <italic>k</italic> is the number of iterations. The time complexity for MMHC is <inline-graphic xlink:href="medinform_v11i1e38266_fig31.png" xlink:type="simple" mimetype="image"/>, where <italic>V</italic> is the set of variables, <italic>S</italic> is the largest set of parents and children, and <italic>l</italic> is a parameter of the algorithm that denotes the size of the largest conditioned subset [<xref ref-type="bibr" rid="ref41">41</xref>].</p>
        <p>In our experiments, we observed that in the worst-case scenario, the running time for a maximum of 100 variables was of the order of hours for MMHC and of the order of minutes for the other algorithms. However, as the number of variables increases to a few thousand, machine learning–based methods such as DAG-GNN and GAE can provide solutions in a reasonable time. The trade-off between complexity (number of iterations) and accuracy can provide a choice between a method that is less accurate but faster or vice versa.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Interpretation of Results</title>
        <p>It is clear from the results that the algorithms have different advantages and disadvantages. Although the PC algorithm performs well across both linear and nonlinear data, it has a low TPR and is computationally intensive. The GES, GFCI, and MMHC algorithms show a very high FPR, but their TPR is higher than that of the PC algorithm. The SHD of the 2 machine learning–based methods—DAG-GNN and GAE—was also considerably lower for both data sets.</p>
        <p>Continuous constraint-based algorithms generally exhibit a very low FDR, except for the benchmark Sachs data set. This is generally because both linear and nonlinear models are based on SEMs with the same causal relationship function at every node—an assumption these algorithms make when learning the causal structure—but the same cannot be guaranteed for the Sachs data [<xref ref-type="bibr" rid="ref13">13</xref>], because such constraints cannot be defined a priori.</p>
        <p>This is corroborated by recent results from Zhu et al [<xref ref-type="bibr" rid="ref56">56</xref>] where such gradient-based methods performed poorly on data generated by a nonlinear model, in which every causal relationship (node function) was sampled from a Gaussian distribution. However, this is a growing research area. In general, in areas such as gene regulatory networks and brain connectivity networks where the number of variables is large, machine learning–based methods can provide comparable results to traditional methods with a much more efficient time complexity and scalability.</p>
      </sec>
      <sec>
        <title>Challenges</title>
        <p>Machine learning for causal structure learning is not without its limitations, which may present several challenges. First, in many applications, there is no ground truth about causal structure, which makes it difficult to evaluate the performance of these algorithms. Furthermore, many scalable methods use stochastic gradient descent; thus, the final output graph is not always deterministic. When the number of data samples or variables is low, traditional or score-based methods are a better choice, especially when the application requires fewer false positives. For the PC, GES, and GFCI algorithms, we observed that they require considerable running time when the number of variables exceeds 100 [<xref ref-type="bibr" rid="ref57">57</xref>].</p>
        <p>However, when it comes to large samples of data (eg, more than 100,000 samples) or hundreds of variables (eg, in many gene networks), machine learning methods can provide a reasonable solution, because other methods fail owing to scalability issues. As machine learning algorithms are highly parallelizable, the solutions can be computed much faster, particularly through the use of a graphical processing unit. These algorithms are potentially useful for many applications related to genetics and biomedicine, especially those with an abundance of observational data.</p>
        <p>The continuous space machine learning models are more scalable and might be useful in the era of big data. Traditional methods might have complexities that grow exponentially with the number of attributes. Despite the nonconvexity of the optimization proposed by Zheng et al [<xref ref-type="bibr" rid="ref58">58</xref>], optimization and learning strategies can be used to help find the optimal solution. Several methods have been used to solve this problem using augmented Lagrangian approaches [<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref52">52</xref>].</p>
        <p>The NOBEARS algorithm reduces the computing complexity of NOTEARS from cubic to quadratic in terms of the number of attributes [<xref ref-type="bibr" rid="ref48">48</xref>], allowing for smooth implementation in data sets that have more than 4000 attributes. The algorithms are also highly parallelizable, and most of the algorithms use deep learning libraries such as TensorFlow [<xref ref-type="bibr" rid="ref59">59</xref>] and PyTorch [<xref ref-type="bibr" rid="ref60">60</xref>].</p>
        <p>Machine learning techniques for causal discovery, which use continuous space optimization, are an emerging area of research, which can lead to more efficient causal discovery, particularly in applications where directed graphs are used to specify causal relations more clearly. With sufficient data, machine learning models can be robust to certain discrepancies such as sample bias, missing data, and erroneous measurements. Many of these applications have also focused on weaker concepts of causality such as pairwise directionality during the analysis of gene networks and brain connectivity networks [<xref ref-type="bibr" rid="ref61">61</xref>,<xref ref-type="bibr" rid="ref62">62</xref>].</p>
        <p>It is noteworthy that machine learning methods are usually black box methods, which might provide lesser insight into the process of derivability of the causal structures. For higher interpretability, an option that has been explored is to develop parallel versions of these algorithms, such as PC [<xref ref-type="bibr" rid="ref63">63</xref>]. In the future, options such as ensemble learning can be explored for the same.</p>
        <p>Some other challenges can be found in finding causal structure from data. In the case of learning causal structure from electronic health record data, they might have several problems, such as missing values or noise in the data, which are very common [<xref ref-type="bibr" rid="ref64">64</xref>]. If the number of missing values or the amount of noise is significant, the application of causal discovery methods might yield unreliable results.</p>
        <p>Furthermore, most causal discovery methods assume that the distribution of data is stationary, which may not be true in certain medical applications [<xref ref-type="bibr" rid="ref65">65</xref>]. Hence, it is very important to consider the aforementioned problems as well as issues related to selection bias before causal structure learning methods are applied. Glymour et al [<xref ref-type="bibr" rid="ref24">24</xref>] discuss some general guidelines to avoid such problems in causal structure learning. These generalized learning algorithms are ineffective in many biomedical applications, such as in learning biological or gene networks, because they do not consider specific network constraints. These constraints can be incorporated into causal structure learning methods for greater efficiency.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this paper, we have discussed the motivation for causal structure discovery in biomedicine as well as some interesting applications. Two paradigms of causal discovery algorithms have been reviewed. Combinatorial or score-based algorithms are used in the first paradigm for optimizing discrete spaces of candidate causal graphs, whereas machine learning algorithms are used in the second paradigm to solve continuous optimization problems with acyclicity constraints. In addition to listing these methods, we have also included resources that readers can use to find appropriate applications. Furthermore, we tested several algorithms against synthetic benchmark data sets and against the Sachs real-world data set and evaluated their relative performances. We have also discussed their theoretical time complexity. Our discussion of the limitations and challenges of various algorithms is intended to offer readers a guide for choosing an algorithm from among the many available options. Finally, we highlight several challenges associated with finding causal structure from real-world data (eg, missing values, nonstationarity, noise, and sampling bias).</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>List of tools for causal discovery.</p>
        <media xlink:href="medinform_v11i1e38266_app1.docx" xlink:title="DOCX File , 21 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">CPDAG</term>
          <def>
            <p>completed partially directed acyclic graph</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">DAG</term>
          <def>
            <p>directed acyclic graph</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">DAG-GNN</term>
          <def>
            <p>Directed Acyclic Graph-Graph Neural Network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">FCI</term>
          <def>
            <p>Fast Causal Inference</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">FDR</term>
          <def>
            <p>false discovery rate</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">FPR</term>
          <def>
            <p>false positive rate</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">GAE</term>
          <def>
            <p>graph autoencoder</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">GES</term>
          <def>
            <p>Greedy Equivalence Search</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">GFCI</term>
          <def>
            <p>Greedy Fast Causal Inference</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">IC</term>
          <def>
            <p>inductive causation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">LiNGAM</term>
          <def>
            <p>linear non-Gaussian acyclic model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">MMHC</term>
          <def>
            <p>Max-Min Hill Climbing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">PC</term>
          <def>
            <p>Peter Spirtes and Clark Glymour</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">SEM</term>
          <def>
            <p>structural equation model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">SHD</term>
          <def>
            <p>structural hamming distance</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">TPR</term>
          <def>
            <p>true positive rate</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>XJ is a Cancer Prevention and Research Institute of Texas scholar in cancer research (RR180012) and was supported in part by the Christopher Sarofim Family Professorship, University of Texas Stars award, University of Texas Health Science Center startup, and the National Institutes of Health under award numbers R01AG066749 and U01TR002062.</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>The survey and experiments on deep learning–based methods and the survey on potential applications were conducted by PU. The survey and experiments on traditional methods were conducted by KZ and CL. XJ and YK conceived the study and provided useful inputs for the potential applications of scalable structure learning.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Spirtes</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Glymour</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Scheines</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kauffman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Aimale</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Wimberly</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Constructing Bayesian network models of gene expression networks from microarray data</article-title>
          <source>Carnegie Mellon University</source>
          <year>2000</year>
          <access-date>2021-01-15</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://kilthub.cmu.edu/articles/journal_contribution/Constructing_Bayesian_Network_Models_of_Gene_Expression_Networks_from_Microarray_Data/6491291">https://kilthub.cmu.edu/articles/journal_contribution/Constructing_Bayesian_Network_Models_of_Gene_Expression_Networks_from_Microarray_Data/6491291</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Park</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Alpert</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>RF</given-names>
            </name>
            <name name-style="western">
              <surname>Zhong</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>The causal relationship between portal usage and self-efficacious health information-seeking behaviors: secondary analysis of the health information national trends survey data</article-title>
          <source>J Med Internet Res</source>
          <year>2021</year>
          <month>01</month>
          <day>27</day>
          <volume>23</volume>
          <issue>1</issue>
          <fpage>e17782</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2021/1/e17782/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/17782</pub-id>
          <pub-id pub-id-type="medline">33502334</pub-id>
          <pub-id pub-id-type="pii">v23i1e17782</pub-id>
          <pub-id pub-id-type="pmcid">PMC7875689</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tian</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bi</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Che</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>A bayesian network analysis of the probabilistic relationships between various obesity phenotypes and cardiovascular disease risk in Chinese adults: Chinese population-based observational study</article-title>
          <source>JMIR Med Inform</source>
          <year>2022</year>
          <month>03</month>
          <day>02</day>
          <volume>10</volume>
          <issue>3</issue>
          <fpage>e33026</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2022/3/e33026/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/33026</pub-id>
          <pub-id pub-id-type="medline">35234651</pub-id>
          <pub-id pub-id-type="pii">v10i3e33026</pub-id>
          <pub-id pub-id-type="pmcid">PMC8928047</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="web">
          <article-title>TransmiR v2.0 database</article-title>
          <source>The Cui Lab</source>
          <access-date>2021-10-14</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.cuilab.cn/transmir">http://www.cuilab.cn/transmir</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Belyaeva</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Squires</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Uhler</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>DCI: learning causal differences between gene regulatory networks</article-title>
          <source>Bioinformatics</source>
          <year>2021</year>
          <month>03</month>
          <day>11</day>
          <volume>37</volume>
          <issue>18</issue>
          <fpage>3067</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btab167</pub-id>
          <pub-id pub-id-type="medline">33704425</pub-id>
          <pub-id pub-id-type="pii">6168117</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huynh-Thu</surname>
              <given-names>VA</given-names>
            </name>
            <name name-style="western">
              <surname>Sanguinetti</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Gene regulatory network inference: an introductory survey</article-title>
          <source>Methods Mol Biol</source>
          <year>2019</year>
          <volume>1883</volume>
          <fpage>1</fpage>
          <lpage>23</lpage>
          <pub-id pub-id-type="doi">10.1007/978-1-4939-8882-2_1</pub-id>
          <pub-id pub-id-type="medline">30547394</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Miao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>RegNetwork: an integrated database of transcriptional and post-transcriptional regulatory networks in human and mouse</article-title>
          <source>Database (Oxford)</source>
          <year>2015</year>
          <volume>2015</volume>
          <fpage>bav095</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/database/article-lookup/doi/10.1093/database/bav095"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/bav095</pub-id>
          <pub-id pub-id-type="medline">26424082</pub-id>
          <pub-id pub-id-type="pii">bav095</pub-id>
          <pub-id pub-id-type="pmcid">PMC4589691</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Quantifying gene regulatory relationships with association measures: a comparative study</article-title>
          <source>Front Genet</source>
          <year>2017</year>
          <volume>8</volume>
          <fpage>96</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.3389/fgene.2017.00096"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fgene.2017.00096</pub-id>
          <pub-id pub-id-type="medline">28751908</pub-id>
          <pub-id pub-id-type="pmcid">PMC5507966</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Neto</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Keller</surname>
              <given-names>MP</given-names>
            </name>
            <name name-style="western">
              <surname>Attie</surname>
              <given-names>AD</given-names>
            </name>
            <name name-style="western">
              <surname>Yandell</surname>
              <given-names>BS</given-names>
            </name>
          </person-group>
          <article-title>Causal graphical models in systems genetics: a unified framework for joint inference of causal network and genetic architecture for correlated phenotypes</article-title>
          <source>Ann Appl Stat</source>
          <year>2010</year>
          <month>03</month>
          <day>01</day>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>320</fpage>
          <lpage>39</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/21218138"/>
          </comment>
          <pub-id pub-id-type="doi">10.1214/09-aoas288</pub-id>
          <pub-id pub-id-type="medline">21218138</pub-id>
          <pub-id pub-id-type="pmcid">PMC3017382</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Adabor</surname>
              <given-names>ES</given-names>
            </name>
            <name name-style="western">
              <surname>Acquaah-Mensah</surname>
              <given-names>GK</given-names>
            </name>
            <name name-style="western">
              <surname>Oduro</surname>
              <given-names>FT</given-names>
            </name>
          </person-group>
          <article-title>SAGA: a hybrid search algorithm for Bayesian Network structure learning of transcriptional regulatory networks</article-title>
          <source>J Biomed Inform</source>
          <year>2015</year>
          <month>02</month>
          <volume>53</volume>
          <fpage>27</fpage>
          <lpage>35</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(14)00191-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2014.08.010</pub-id>
          <pub-id pub-id-type="medline">25181467</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(14)00191-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xue</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Cooper</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Tumour-specific causal inference discovers distinct disease mechanisms underlying cancer subtypes</article-title>
          <source>Sci Rep</source>
          <year>2019</year>
          <month>09</month>
          <day>13</day>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>13225</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-019-48318-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-019-48318-7</pub-id>
          <pub-id pub-id-type="medline">31519988</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-019-48318-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC6744493</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ha</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Baladandayuthapani</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Do</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Prognostic gene signature identification using causal structure learning: applications in kidney cancer</article-title>
          <source>Cancer Inform</source>
          <year>2015</year>
          <volume>14</volume>
          <issue>Suppl 1</issue>
          <fpage>23</fpage>
          <lpage>35</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/abs/10.4137/CIN.S14873?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%3dpubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.4137/CIN.S14873</pub-id>
          <pub-id pub-id-type="medline">25861215</pub-id>
          <pub-id pub-id-type="pii">cin-suppl.1-2015-023</pub-id>
          <pub-id pub-id-type="pmcid">PMC4362630</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sachs</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Perez</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Pe'er</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Lauffenburger</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Nolan</surname>
              <given-names>GP</given-names>
            </name>
          </person-group>
          <article-title>Causal protein-signaling networks derived from multiparameter single-cell data</article-title>
          <source>Science</source>
          <year>2005</year>
          <month>04</month>
          <day>22</day>
          <volume>308</volume>
          <issue>5721</issue>
          <fpage>523</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1126/science.1105809</pub-id>
          <pub-id pub-id-type="medline">15845847</pub-id>
          <pub-id pub-id-type="pii">308/5721/523</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Achard</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Salvador</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Whitcher</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Suckling</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bullmore</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>A resilient, low-frequency, small-world human brain functional network with highly connected association cortical hubs</article-title>
          <source>J Neurosci</source>
          <year>2006</year>
          <month>01</month>
          <day>04</day>
          <volume>26</volume>
          <issue>1</issue>
          <fpage>63</fpage>
          <lpage>72</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jneurosci.org/cgi/pmidlookup?view=long&#38;pmid=16399673"/>
          </comment>
          <pub-id pub-id-type="doi">10.1523/JNEUROSCI.3874-05.2006</pub-id>
          <pub-id pub-id-type="medline">16399673</pub-id>
          <pub-id pub-id-type="pii">26/1/63</pub-id>
          <pub-id pub-id-type="pmcid">PMC6674299</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Deshpande</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Investigating effective brain connectivity from fMRI data: past findings and current issues with reference to Granger causality analysis</article-title>
          <source>Brain Connect</source>
          <year>2012</year>
          <volume>2</volume>
          <issue>5</issue>
          <fpage>235</fpage>
          <lpage>45</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/23016794"/>
          </comment>
          <pub-id pub-id-type="doi">10.1089/brain.2012.0091</pub-id>
          <pub-id pub-id-type="medline">23016794</pub-id>
          <pub-id pub-id-type="pmcid">PMC3621319</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brovelli</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ledberg</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Nakamura</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bressler</surname>
              <given-names>SL</given-names>
            </name>
          </person-group>
          <article-title>Beta oscillations in a large-scale sensorimotor cortical network: directional influences revealed by Granger causality</article-title>
          <source>Proc Natl Acad Sci U S A</source>
          <year>2004</year>
          <month>06</month>
          <day>29</day>
          <volume>101</volume>
          <issue>26</issue>
          <fpage>9849</fpage>
          <lpage>54</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/15210971"/>
          </comment>
          <pub-id pub-id-type="doi">10.1073/pnas.0308538101</pub-id>
          <pub-id pub-id-type="medline">15210971</pub-id>
          <pub-id pub-id-type="pii">0308538101</pub-id>
          <pub-id pub-id-type="pmcid">PMC470781</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bielczyk</surname>
              <given-names>NZ</given-names>
            </name>
            <name name-style="western">
              <surname>Uithol</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>van Mourik</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Anderson</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Glennon</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Buitelaar</surname>
              <given-names>JK</given-names>
            </name>
          </person-group>
          <article-title>Disentangling causal webs in the brain using functional magnetic resonance imaging: a review of current approaches</article-title>
          <source>Netw Neurosci</source>
          <year>2019</year>
          <volume>3</volume>
          <issue>2</issue>
          <fpage>237</fpage>
          <lpage>73</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30793082"/>
          </comment>
          <pub-id pub-id-type="doi">10.1162/netn_a_00062</pub-id>
          <pub-id pub-id-type="medline">30793082</pub-id>
          <pub-id pub-id-type="pii">netn_a_00062</pub-id>
          <pub-id pub-id-type="pmcid">PMC6370462</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Jeong</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Jung</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kwak</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rho</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Personality factors predicting smartphone addiction predisposition: behavioral inhibition and activation systems, impulsivity, and self-control</article-title>
          <source>PLoS One</source>
          <year>2016</year>
          <volume>11</volume>
          <issue>8</issue>
          <fpage>e0159788</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0159788"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0159788</pub-id>
          <pub-id pub-id-type="medline">27533112</pub-id>
          <pub-id pub-id-type="pii">PONE-D-15-35202</pub-id>
          <pub-id pub-id-type="pmcid">PMC4988723</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Savitz</surname>
              <given-names>SI</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Schulz</surname>
              <given-names>PE</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Counterfactual analysis of differential comorbidity risk factors in Alzheimer’s disease and related dementias</article-title>
          <source>PLOS Digit Health</source>
          <year>2022</year>
          <month>3</month>
          <day>15</day>
          <volume>1</volume>
          <issue>3</issue>
          <fpage>e0000018</fpage>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000018</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Neufeld</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Judea Pearl. Probabilistic reasoning in intelligent systems: networks of plausible inference. Series in representation and reasoning. Morgan Kaufmann, San Mateo 1988, xix + 552 pp</article-title>
          <source>J Symb Log</source>
          <year>2014</year>
          <month>03</month>
          <day>12</day>
          <volume>58</volume>
          <issue>2</issue>
          <fpage>721</fpage>
          <pub-id pub-id-type="doi">10.2307/2275238</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chickering</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Learning Bayesian networks is NP-complete</article-title>
          <source>Learning from Data</source>
          <year>1996</year>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>Springer</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="web">
          <article-title>Bayesian network repository</article-title>
          <source>BN Learn</source>
          <access-date>2022-03-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.bnlearn.com/bnrepository/">https://www.bnlearn.com/bnrepository/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ling</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Upadhyaya</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Heterogeneous treatment effect estimation using machine learning for healthcare application: tutorial and benchmark internet</article-title>
          <source>arXiv</source>
          <year>2021</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2109.12769"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Glymour</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Spirtes</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Review of causal discovery methods based on graphical models</article-title>
          <source>Front Genet</source>
          <year>2019</year>
          <volume>10</volume>
          <fpage>524</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.3389/fgene.2019.00524"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fgene.2019.00524</pub-id>
          <pub-id pub-id-type="medline">31214249</pub-id>
          <pub-id pub-id-type="pmcid">PMC6558187</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vowels</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Camgoz</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Bowden</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>D'ya like DAGs? A survey on structure learning and causal discovery</article-title>
          <source>arXiv</source>
          <year>2021</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2103.02582"/>
          </comment>
          <pub-id pub-id-type="doi">10.1145/3527154</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Spirtes</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Glymour</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>An algorithm for fast recovery of sparse causal graphs</article-title>
          <source>Social Sci Comput Rev</source>
          <year>2016</year>
          <month>08</month>
          <day>18</day>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>62</fpage>
          <lpage>72</lpage>
          <pub-id pub-id-type="doi">10.1177/089443939100900106</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Colombo</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Maathuis</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Order-independent constraint-based causal structure learning</article-title>
          <source>arXiv</source>
          <year>2012</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1211.3295"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Runge</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Nowack</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Kretschmer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Flaxman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sejdinovic</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Detecting and quantifying causal associations in large nonlinear time series datasets</article-title>
          <source>Sci Adv</source>
          <year>2019</year>
          <month>11</month>
          <volume>5</volume>
          <issue>11</issue>
          <fpage>eaau4996</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.science.org/doi/10.1126/sciadv.aau4996?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%3dpubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1126/sciadv.aau4996</pub-id>
          <pub-id pub-id-type="medline">31807692</pub-id>
          <pub-id pub-id-type="pii">aau4996</pub-id>
          <pub-id pub-id-type="pmcid">PMC6881151</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Runge</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Recent progress and new methods for detecting causal relations in large nonlinear time series datasets</article-title>
          <source>Proceedings of the EGU General Assembly 2020</source>
          <year>2020</year>
          <conf-name>EGU General Assembly 2020</conf-name>
          <conf-date>May 4–8, 2020</conf-date>
          <conf-loc>Online</conf-loc>
          <pub-id pub-id-type="doi">10.5194/egusphere-egu2020-9554</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pearl</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <source>Causality</source>
          <year>2009</year>
          <publisher-loc>Cambridge, England</publisher-loc>
          <publisher-name>Cambridge University Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Verma</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Graphical aspects of causal models</article-title>
          <source>UCLA</source>
          <year>1992</year>
          <month>9</month>
          <day>1</day>
          <access-date>2021-02-15</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ftp.cs.ucla.edu/pub/stat_ser/r191.pdf">https://ftp.cs.ucla.edu/pub/stat_ser/r191.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Spirtes</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Glymour</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Scheines</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <source>Causation, Prediction, and Search</source>
          <year>2000</year>
          <publisher-loc>Cambridge, Massachusetts</publisher-loc>
          <publisher-name>The MIT Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Verma</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Pearl</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Equivalence and synthesis of causal models</article-title>
          <source>Proceedings of the Sixth Annual Conference on Uncertainty in Artificial Intelligence</source>
          <year>1990</year>
          <conf-name>UAI '90: Proceedings of the Sixth Annual Conference on Uncertainty in Artificial Intelligence</conf-name>
          <conf-date>Jul 27 - 29, 1990</conf-date>
          <conf-loc>Cambridge, MA, USA</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.5555/647233.719736"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Escalante</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Escalera</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Guyon</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Baró</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Güçlütürk</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Güçlü</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>van Gerven</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <source>Explainable and Interpretable Models in Computer Vision and Machine Learning</source>
          <year>2018</year>
          <publisher-loc>Cham, Basel, Switzerland</publisher-loc>
          <publisher-name>Springer International Publishing</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Buntine</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Theory refinement on Bayesian networks</article-title>
          <source>arXiv</source>
          <year>2013</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1303.5709"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/b978-1-55860-203-8.50010-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Andrews</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ramsey</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cooper</surname>
              <given-names>GF</given-names>
            </name>
          </person-group>
          <article-title>Scoring Bayesian networks of mixed variables</article-title>
          <source>Int J Data Sci Anal</source>
          <year>2018</year>
          <month>08</month>
          <day>11</day>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>3</fpage>
          <lpage>18</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30140730"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s41060-017-0085-7</pub-id>
          <pub-id pub-id-type="medline">30140730</pub-id>
          <pub-id pub-id-type="pmcid">PMC6101981</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chickering</surname>
              <given-names>DM</given-names>
            </name>
          </person-group>
          <article-title>Optimal structure identification with greedy search</article-title>
          <source>J Mach Learn Res</source>
          <year>2002</year>
          <volume>3</volume>
          <fpage>507</fpage>
          <lpage>54</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmlr.org/papers/volume3/chickering02b/chickering02b.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meek</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Causal inference and causal explanation with background knowledge</article-title>
          <source>arXiv</source>
          <year>2013</year>
          <fpage>403</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1302.4972"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/978-94-009-7731-0_8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ramsey</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Scaling up greedy causal search for continuous variables</article-title>
          <source>arXiv</source>
          <year>2015</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1507.07749"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cooper</surname>
              <given-names>GF</given-names>
            </name>
            <name name-style="western">
              <surname>Herskovits</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>A Bayesian method for the induction of probabilistic networks from data</article-title>
          <source>Mach Learn</source>
          <year>1992</year>
          <month>10</month>
          <volume>9</volume>
          <issue>4</issue>
          <fpage>309</fpage>
          <lpage>47</lpage>
          <pub-id pub-id-type="doi">10.1007/bf00994110</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tsamardinos</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>LE</given-names>
            </name>
            <name name-style="western">
              <surname>Aliferis</surname>
              <given-names>CF</given-names>
            </name>
          </person-group>
          <article-title>The max-min hill-climbing Bayesian network structure learning algorithm</article-title>
          <source>Mach Learn</source>
          <year>2006</year>
          <month>3</month>
          <day>28</day>
          <volume>65</volume>
          <issue>1</issue>
          <fpage>31</fpage>
          <lpage>78</lpage>
          <pub-id pub-id-type="doi">10.1007/s10994-006-6889-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tsamardinos</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Aliferis</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Statnikov</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Time and sample efficient discovery of Markov blankets and direct causal relations</article-title>
          <source>Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining</source>
          <year>2003</year>
          <conf-name>KDD '03: Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining</conf-name>
          <conf-date>Aug 24 - 27, 2003</conf-date>
          <conf-loc>Washington, D.C.</conf-loc>
          <pub-id pub-id-type="doi">10.1145/956750.956838</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shimizu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hoyer</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Hyvärinen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kerminen</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>A linear non-Gaussian acyclic model for causal discovery</article-title>
          <source>J Mach Learn Res</source>
          <year>2006</year>
          <volume>7</volume>
          <fpage>2003</fpage>
          <lpage>30</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmlr.org/papers/volume7/shimizu06a/shimizu06a.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shimizu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hyvärinen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kano</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hoyer</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Discovery of non-Gaussian linear causal models using ICA</article-title>
          <source>arXiv</source>
          <year>2012</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1207.1413"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shimizu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Inazumi</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sogawa</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hyvärinen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kawahara</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Washio</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Hoyer</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bollen</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>DirectLiNGAM: a direct method for learning a linear non-Gaussian structural equation model</article-title>
          <source>arXiv</source>
          <year>2011</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1101.2489"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hyvärinen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>SM</given-names>
            </name>
          </person-group>
          <article-title>Pairwise likelihood ratios for estimation of non-Gaussian structural equation models</article-title>
          <source>J Mach Learn Res</source>
          <year>2013</year>
          <month>01</month>
          <volume>14</volume>
          <issue>Jan</issue>
          <fpage>111</fpage>
          <lpage>52</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31695580"/>
          </comment>
          <pub-id pub-id-type="medline">31695580</pub-id>
          <pub-id pub-id-type="pmcid">PMC6834441</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hoyer</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Janzing</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mooij</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Peters</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schölkopf</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Nonlinear causal discovery with additive noise models</article-title>
          <source>Proceedings of the Advances in Neural Information Processing Systems 21 (NIPS 2008)</source>
          <year>2008</year>
          <conf-name>Advances in Neural Information Processing Systems 21 (NIPS 2008)</conf-name>
          <conf-date>Dec 8-11, 2008</conf-date>
          <conf-loc>Vancouver, BC, Canada</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Danieletto</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Miotto</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Cherng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dudley</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Scaling structural learning with NO-BEARS to infer causal transcriptome networks</article-title>
          <source>arXiv</source>
          <year>2019</year>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Dan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Aragam</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ravikumar</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Xing</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Learning sparse nonparametric DAGs</article-title>
          <source>arXiv</source>
          <year>2019</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1909.13189"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>DAG-GNN: DAG structure learning with graph neural networks</article-title>
          <source>arXiv</source>
          <year>2019</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1904.10098"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>DAGs with no fears: a closer look at continuous optimization for learning Bayesian networks</article-title>
          <source>arXiv</source>
          <year>2020</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2010.09133"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Fang</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>A graph autoencoder approach to causal structure learning</article-title>
          <source>arXiv</source>
          <year>2019</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1911.07420"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lachapelle</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Brouillard</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Deleu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lacoste-Julien</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Gradient-based neural DAG learning</article-title>
          <source>arXiv</source>
          <year>2019</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openreview.net/pdf?id=rklbKA4YDS"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goudet</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Kalainathan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Caillou</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Guyon</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Lopez-Paz</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sebag</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Learning functional causal models with generative neural networks</article-title>
          <source>arXiv</source>
          <year>2018</year>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kalainathan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Goudet</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Guyon</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Lopez-Paz</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sebag</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Structural agnostic modeling: adversarial learning of causal graphs</article-title>
          <source>arXiv</source>
          <year>2022</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://hal-lirmm.ccsd.cnrs.fr/UMR8623/hal-01864239v1"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Causal discovery with reinforcement learning</article-title>
          <source>arXiv</source>
          <year>2020</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1906.04477"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ramsey</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Glymour</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sanchez-Romero</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Glymour</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>A million variables and more: the Fast Greedy Equivalence Search algorithm for learning high-dimensional graphical causal models, with an application to functional magnetic resonance images</article-title>
          <source>Int J Data Sci Anal</source>
          <year>2017</year>
          <month>03</month>
          <volume>3</volume>
          <issue>2</issue>
          <fpage>121</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/28393106"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s41060-016-0032-z</pub-id>
          <pub-id pub-id-type="medline">28393106</pub-id>
          <pub-id pub-id-type="pmcid">PMC5380925</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Aragam</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ravikumar</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Xing</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>DAGs with NO TEARS: continuous optimization for structure learning</article-title>
          <source>arXiv</source>
          <year>2018</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1803.01422"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abadi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Barham</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Devin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghemawat</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Irving</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Isard</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>TensorFlow: a system for large-scale machine learning</article-title>
          <source>Proceedings of the 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)</source>
          <year>2016</year>
          <conf-name>12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)</conf-name>
          <conf-date>Nov 2–4, 2016</conf-date>
          <conf-loc>Savannah, GA, USA</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Paszke</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gross</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Massa</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Lerer</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bradbury</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chanan</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Killeen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Gimelshein</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Antiga</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Desmaison</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kopf</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>DeVito</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Raison</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Tejani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chilamkurthy</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Steiner</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Fang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bai</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chintala</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>PyTorch: an imperative style, high-performance deep learning library</article-title>
          <source>arXiv</source>
          <year>2019</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://proceedings.neurips.cc/paper/2019/file/bdbca288fee7f92f2bfa9f7012727740-Paper.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Niu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Pairwise interactions among brain regions organize large-scale functional connectivity during execution of various tasks</article-title>
          <source>Neuroscience</source>
          <year>2019</year>
          <month>08</month>
          <day>01</day>
          <volume>412</volume>
          <fpage>190</fpage>
          <lpage>206</lpage>
          <pub-id pub-id-type="doi">10.1016/j.neuroscience.2019.05.011</pub-id>
          <pub-id pub-id-type="medline">31181368</pub-id>
          <pub-id pub-id-type="pii">S0306-4522(19)30327-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref62">
        <label>62</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Butte</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kohane</surname>
              <given-names>IS</given-names>
            </name>
          </person-group>
          <article-title>Mutual information relevance networks: functional genomic clustering using pairwise entropy measurements</article-title>
          <source>Pac Symp Biocomput</source>
          <year>2000</year>
          <fpage>418</fpage>
          <lpage>29</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://psb.stanford.edu/psb-online/proceedings/psb00/abstracts/p418.html"/>
          </comment>
          <pub-id pub-id-type="doi">10.1142/9789814447331_0040</pub-id>
          <pub-id pub-id-type="medline">10902190</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref63">
        <label>63</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Tian</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>A fast PC algorithm with reversed-order pruning and a parallelization strategy</article-title>
          <source>arXiv</source>
          <year>2021</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2109.04626"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref64">
        <label>64</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ackermann</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Mohan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kjellström</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Causal discovery in the presence of missing data</article-title>
          <source>arXiv</source>
          <year>2020</year>
        </nlm-citation>
      </ref>
      <ref id="ref65">
        <label>65</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sanchez-Romero</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Glymour</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Schölkopf</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Behind distribution shift: mining driving forces of changes and causal arrows</article-title>
          <source>Proc IEEE Int Conf Data Min</source>
          <year>2017</year>
          <month>11</month>
          <volume>2017</volume>
          <fpage>913</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31068766"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/ICDM.2017.114</pub-id>
          <pub-id pub-id-type="medline">31068766</pub-id>
          <pub-id pub-id-type="pmcid">PMC6502242</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
