<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i10e18395</article-id>
      <article-id pub-id-type="pmid">33006565</article-id>
      <article-id pub-id-type="doi">10.2196/18395</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Phenotypically Similar Rare Disease Identification from an Integrative Knowledge Graph for Data Harmonization: Preliminary Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Mohammad Gholi Mezerji</surname>
            <given-names>Naser</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Yang</surname>
            <given-names>Jiaoyun</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Zhu</surname>
            <given-names>Qian</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Division of Pre-Clinical Innovation</institution>
            <institution>National Center for Advancing Translational Sciences (NCATS)</institution>
            <institution>National Institutes of Health (NIH)</institution>
            <addr-line>9800 Medical Center Drive</addr-line>
            <addr-line>Rockville, MD, 20850</addr-line>
            <country>United States</country>
            <phone>1 301 4807841</phone>
            <email>qian.zhu@nih.gov</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4858-6333</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Nguyen</surname>
            <given-names>Dac-Trung</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2591-9948</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Alyea</surname>
            <given-names>Gioconda</given-names>
          </name>
          <degrees>MS, MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9310-0163</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Hanson</surname>
            <given-names>Karen</given-names>
          </name>
          <degrees>MS, MBA</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2253-1043</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Sid</surname>
            <given-names>Eric</given-names>
          </name>
          <degrees>MD, MHA</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7697-3026</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Pariser</surname>
            <given-names>Anne</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9421-0126</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Division of Pre-Clinical Innovation</institution>
        <institution>National Center for Advancing Translational Sciences (NCATS)</institution>
        <institution>National Institutes of Health (NIH)</institution>
        <addr-line>Rockville, MD</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>ICF International Inc</institution>
        <addr-line>Rockville, MD</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Office of Rare Diseases Research (ORDR)</institution>
        <institution>National Center for Advancing Translational Sciences (NCATS)</institution>
        <institution>National Institutes of Health (NIH)</institution>
        <addr-line>Bethesda, MD</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Qian Zhu <email>qian.zhu@nih.gov</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>10</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>2</day>
        <month>10</month>
        <year>2020</year>
      </pub-date>
      <volume>8</volume>
      <issue>10</issue>
      <elocation-id>e18395</elocation-id>
      <history>
        <date date-type="received">
          <day>24</day>
          <month>2</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>9</day>
          <month>6</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>2</day>
          <month>8</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>19</day>
          <month>8</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Qian Zhu, Dac-Trung Nguyen, Gioconda Alyea, Karen Hanson, Eric Sid, Anne Pariser. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 02.10.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2020/10/e18395" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Although many efforts have been made to develop comprehensive disease resources that capture rare disease information for the purpose of clinical decision making and education, there is no standardized protocol for defining and harmonizing rare diseases across multiple resources. This introduces data redundancy and inconsistency that may ultimately increase confusion and difficulty for the wide use of these resources. To overcome such encumbrances, we report our preliminary study to identify phenotypical similarity among genetic and rare diseases (GARD) that are presenting similar clinical manifestations, and support further data harmonization.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>To support rare disease data harmonization, we aim to systematically identify phenotypically similar GARD diseases from a disease-oriented integrative knowledge graph and determine their similarity types.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We identified phenotypically similar GARD diseases programmatically with 2 methods: (1) we measured disease similarity by comparing disease mappings between GARD and other rare disease resources, incorporating manual assessment; (2) we derived clinical manifestations presenting among sibling diseases from disease classifications and prioritized the identified similar diseases based on their phenotypes and genotypes.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>For disease similarity comparison, approximately 87% (341/392) identified, phenotypically similar disease pairs were validated; 80% (271/392) of these disease pairs were accurately identified as phenotypically similar based on similarity score. The evaluation result shows a high precision (94%) and a satisfactory quality (86% F measure). By deriving phenotypical similarity from Monarch Disease Ontology (MONDO) and Orphanet disease classification trees, we identified a total of 360 disease pairs with at least 1 shared clinical phenotype and gene, which were applied for prioritizing clinical relevance. A total of 662 phenotypically similar disease pairs were identified and will be applied for GARD data harmonization.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>We successfully identified phenotypically similar rare diseases among the GARD diseases via 2 approaches, disease mapping comparison and phenotypical similarity derivation from disease classification systems. The results will not only direct GARD data harmonization in expanding translational science research but will also accelerate data transparency and consistency across different disease resources and terminologies, helping to build a robust and up-to-date knowledge resource on rare diseases.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>GARD</kwd>
        <kwd>rare diseases</kwd>
        <kwd>phenotypical similarity</kwd>
        <kwd>data harmonization</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>A rare disease in the United States is defined by the 1983 Orphan Drug Act as a condition that affects fewer than 200,000 people [<xref ref-type="bibr" rid="ref1">1</xref>], whereas the analogous legislation introduced in the European Union in 2000 considers a disease to be rare when it affects fewer than 1 in 2,000 people [<xref ref-type="bibr" rid="ref2">2</xref>]. In comparison to common diseases, health care providers are challenged by a lack of familiarity with diagnosing and treating rare diseases, which can lead to missed, delayed, or inaccurate diagnoses even when an approved, effective therapy is available [<xref ref-type="bibr" rid="ref3">3</xref>]. Improved understanding and recognition of rare diseases are key for accurate and timely diagnosis, and this relies on broad dissemination of and access to knowledge about rare diseases [<xref ref-type="bibr" rid="ref4">4</xref>]. A huge amount of effort has been made to develop rare disease resources for patients, families, and clinicians, such as the Genetic and Rare Diseases Information Center (GARD) [<xref ref-type="bibr" rid="ref5">5</xref>], Orphanet [<xref ref-type="bibr" rid="ref6">6</xref>], and Monarch Disease Ontology (MONDO) [<xref ref-type="bibr" rid="ref7">7</xref>]; however, disparate data and incomplete data harmonization are still major barriers to improved coordination across specialists, leading to inefficiencies and delays in diagnosis, care, and treatment. 
This is exemplified by the difficulty faced in accurately answering the question, <italic>how many total rare diseases are there?</italic> A recent report by Haendel et al [<xref ref-type="bibr" rid="ref8">8</xref>], after an examination of multiple rare resources, concluded that “there are total of 10,393 rare diseases in MONDO…the majority, 6370 rare diseases, are presented in three or more resources, whereas 4023 are unique to one source.” The fact that more than one-third of rare diseases are unique to 1 source highlights a reality that those resources continue to use their own disease definitions or harmonization rules to develop their rare disease vocabularies. Insufficient effort put toward data harmonization ultimately leads to redundancy in categorization efforts and a resulting inconsistency of rare disease representation globally.</p>
      <p>The goal of data harmonization is to improve the compatibility of data collected from independent sources (horizontally) in order to better understand disease etiology from different angles, which may forward the discovery of therapeutic approaches for rare diseases. For each individual source, data harmonization is crucial to better represent and organize data for supporting data harmonization horizontally. Current data harmonization efforts are primarily aligning standard nomenclatures or human efforts to translate specific medical and clinical features into a standardized and sharable format. For instance, Pontikos et al [<xref ref-type="bibr" rid="ref9">9</xref>] introduced Phenopolis, an open platform for the harmonization and analysis of genetic and phenotypic data that harmonizes phenotypes with the help of Human Phenotype Ontology (HPO). The International Cancer Genome Consortium (ICGC) and The Cancer Genome Atlas (TCGA) invited the cancer-genomics and bioinformatics communities to work together to identify the best pipelines for the detection of mutations in DNA-sequencing reads for cancer genomes in order to facilitate the harmonization of mutation-calling procedures among institutions [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. Orphanet and OMIM (Online Mendelian Inheritance in Man) heavily relied on human efforts for their data curation and harmonization [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. To avoid cumbersome human efforts and a lack of rare disease standards in this study, we proposed to systematically identify phenotypically similar rare diseases from GARD and determine their similarity types, including duplicate diseases, sibling diseases, and subtypes for supporting rare disease data harmonization.</p>
      <p>Rare disease designations are often in conflict across different datasets due to the differing statutory requirements used in defining a rare disease in different countries, and as such, useful methods to improve interoperability across these broad terminologies and standards are required. With the aim of eliminating data redundancy and inconsistency across different resources, improving data interoperability, and facilitating data harmonization, the implementation of a knowledge graph is capable of semantically organizing and integrating complex networks of data into one collection. Knowledge graphs have been widely applied in the medical domain and in the rare disease field. For instance, Reumann et al [<xref ref-type="bibr" rid="ref14">14</xref>] reported their solution for cognitive differential diagnosis (DDx) in rare diseases based on knowledge graph technology that incorporates data from ICD-10, DOID, medDRA, PubMed, Wikipedia, Orphanet, the CDC, and anonymized patient data. Li et al [<xref ref-type="bibr" rid="ref15">15</xref>] presented their work to develop a rare disease classification algorithm established on a knowledge graph. Sosa et al [<xref ref-type="bibr" rid="ref16">16</xref>] applied a knowledge graph–embedding method that explicitly models the uncertainty associated with literature-derived relationships and uses link prediction to generate drug repurposing hypotheses for rare diseases. In this study, we accessed data from an integrative knowledge graph that we developed from our previous study [<xref ref-type="bibr" rid="ref17">17</xref>] with a variety of rare disease-related resources for phenotypical similarity identification among GARD diseases.</p>
      <p>In this study, we report our preliminary work to identify phenotypically similar GARD diseases from an integrative knowledge graph using 2 approaches: (1) disease mapping comparison, and (2) phenotypical similarity derivation from disease classification systems. This effort will not only direct GARD data harmonization but will also support data harmonization across different resources, and eventually support clinical decision making. Phenotypically similar GARD diseases applied in this study specifically refer to disease subtypes and sibling diseases that share similar clinical manifestations. For example, 2 GARD diseases of “lactate dehydrogenase deficiency” and “lactate dehydrogenase A deficiency” are subtypes, and they have similar phenotypical profiles.</p>
      <sec>
        <title>Background and Materials</title>
        <sec>
          <title>Rare Disease Resources</title>
          <p>The Genetic and Rare Diseases Information Center (GARD) is a program managed by the National Center for Advancing Translational Sciences (NCATS), National Institutes of Health (NIH). Since 2003, GARD has provided the public with access to current, reliable, and easy-to-understand information about rare and genetic diseases [<xref ref-type="bibr" rid="ref5">5</xref>]. As part of the data harmonization effort toward furthering the development of the GARD, we harmonized GARD diseases according to their phenotypical similarity in this study. To fulfill this task, we assessed phenotypical similarity among GARD diseases by leveraging several well-known disease resources, including Orphanet, OMIM, MONDO, the HPO, and the UMLS (Unified Medical Language System), owing to their complementary focus and coverage. We briefly describe these applied resources below.</p>
          <p>Orphanet is an EU resource that focuses on gathering and improving knowledge on rare diseases [<xref ref-type="bibr" rid="ref6">6</xref>]. Rare diseases in the Orphanet, depending on their clinical presentation, are included in as many classifications as needed. The Orphanet classification is organized according to three hierarchical levels: group of disorders, disorder, and subtype of a disorder. The disorder level is designated as the main topologic level for each clinical entity characterized by a set of homogeneous phenotypic abnormalities and evolution, allowing for a definitive clinical diagnosis [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>].</p>
          <p>OMIM (Online Mendelian Inheritance in Man) is a comprehensive, authoritative compendium of human genes and genetic phenotypes that is freely available and updated daily. It contains information on all known mendelian disorders and over 15,000 genes. OMIM focuses on the relationship between phenotype and genotype [<xref ref-type="bibr" rid="ref20">20</xref>].</p>
          <p>MONDO (Monarch Disease Ontology) aims to harmonize disease definitions across the world. It is a semi-automatically constructed ontology that merges multiple disease resources to yield a coherent merged ontology. One feature of the MONDO is that it curates precise 1-to-1 equivalence axioms connecting to other resources, validated by OWL reasoning [<xref ref-type="bibr" rid="ref7">7</xref>]. MONDO provides a hierarchical structure that can be used for classification or for rolling up diseases to higher-level groupings.</p>
          <p>The Human Phenotype Ontology (HPO) provides a standardized vocabulary of phenotypic abnormalities encountered in human disease. HPO currently contains over 13,000 terms and over 156,000 annotations to hereditary diseases [<xref ref-type="bibr" rid="ref21">21</xref>].</p>
          <p>The Unified Medical Language System (UMLS) is a terminology integration system developed at the National Library of Medicine (NLM). The UMLS Metathesaurus integrates more than 160 biomedical vocabularies. Synonymous terms from the various source vocabularies are grouped into one concept [<xref ref-type="bibr" rid="ref22">22</xref>].</p>
        </sec>
        <sec>
          <title>An Integrative Knowledge Graph</title>
          <p>We previously developed an integrative knowledge graph with 34 different biomedical data resources at the time of writing, including the aforementioned resources. This graph database is hosted in Neo4j and is publicly accessible without login credentials [<xref ref-type="bibr" rid="ref17">17</xref>]. In this study, we accessed this knowledge graph to obtain data from the aforementioned resources and applied it for the measurement of phenotypical similarity among GARD diseases.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <p>In this study, we aimed to identify phenotypical similarity among rare diseases to support data harmonization and data interoperability with existing standardized terminologies and ontologies. We designed two complementary approaches: (1) analysis of disease mappings to Orphanet, OMIM, and the UMLS to measure phenotypical similarity among GARD diseases; (2) prioritizing phenotypical similarity derived from MONDO and Orphanet disease classification systems with shared phenotypes from the HPO and genes from OMIM. The architecture of this study is shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>The architecture of phenotypically similar GARD disease identification.</p>
        </caption>
        <graphic xlink:href="medinform_v8i10e18395_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <sec>
        <title>Phenotypical Similarity Identification Based on GARD Disease Mappings</title>
        <p>In order to identify phenotypical similarity, we computed disease similarity among disease mappings between GARD diseases and disease concepts from Orphanet, OMIM, and the UMLS, which offer a wide spectrum of characteristics of rare diseases—in Orphanet, diseases are defined upon their clinical presentation; in OMIM, disease definition is based on genetic etiology; in UMLS, a broader biomedical definition of diseases is offered.</p>
      </sec>
      <sec>
        <title>Disease Mapping Retrieval from the Knowledge Graph</title>
        <p>We obtained disease mappings from the aforementioned knowledge graph. There are 2 ways to retrieve disease mappings for GARD diseases from the knowledge graph: (1) by developing mappings based on specific edge properties; for instance, 2 concepts with the same concept names are mapped via one edge property of “N_Name”; (2) by extracting mappings directly from GARD disease nodes, which store GARD-curated external mappings to Orphanet, OMIM, and the UMLS. To ensure mapping quality, we performed the second approach by accessing 1 node property of I_CODE and storing external mappings for each GARD disease node. For instance, 3 external mappings, including “OMIM:603358,” “ORPHANET:53693,” and “UMLS:C1864002” for the GARD disease of “GRACILE SYNDROME(GARD:0000001),” are stored in its property of “I_CODE” and can be retrieved by executing the following Cypher Query 1 [<xref ref-type="bibr" rid="ref23">23</xref>], which is Neo4j's graph query language that allows users to store and retrieve data from the graph database:</p>
        <p><bold>Cypher Query 1.</bold> match P = (n:S_GARD<sup>a</sup>) where any (x in n.I_CODE where x= “GARD:0000001”) return n.I_CODE</p>
        <p><sup>a</sup>S_GARD referring to GARD data</p>
        <p>We executed the Cypher Queries listed in <xref ref-type="table" rid="table1">Table 1</xref> to retrieve disease mappings for GARD diseases. Each GARD disease obtains zero to multiple mappings accordingly. For instance, “Gracile Syndrome (GARD:0000001)” has the 3 disease mappings described above; however, “Acalvaria (GARD:0000361)” only has 1 mapping, “ORPHANET:945.” To ensure that each GARD disease was associated with at least 1 mapping for similarity measurement, we excluded 1498 GARD diseases with no mappings to any of these 3 resources.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Disease mapping extraction from the Neo4j knowledge graph.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="800"/>
            <thead>
              <tr valign="top">
                <td>Disease mappings</td>
                <td>Cypher Queries</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>GARD2Orphanet</td>
                <td>match P = (n:S_GARD) where any (x in n.I_CODE where x=~ ‘ORPHA.*’)return distinct n.I_CODE</td>
              </tr>
              <tr valign="top">
                <td>GARD2OMIM</td>
                <td>match P = (n:S_GARD) where any (x in n.I_CODE where x=~ ‘OMIM.*’)return distinct n.I_CODE</td>
              </tr>
              <tr valign="top">
                <td>GARD2UMLS</td>
                <td>match P = (n:S_GARD) where any (x in n.I_CODE where x=~ ‘UMLS.*’)return distinct n.I_CODE</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Calculating Similarity to Prioritize Phenotypical Similarity of GARD Disease Pairs</title>
        <p>In order to compare phenotypical similarity among the GARD diseases based on their similarity, we enumerated all mappings obtained for 5236 GARD diseases and ended with a total of 9672 mappings. For each GARD disease, we generated fingerprints based on those mappings. One disease mapping corresponds to one binary fingerprint, with presence denoted as 1 and absence denoted as 0. To this end, each GARD disease was represented as a vector of 9672 bits. Then, we calculated cosine similarity [<xref ref-type="bibr" rid="ref24">24</xref>] for each pair of GARD diseases based on their fingerprints. For those disease pairs without any shared mappings, which means their similarity score equals 0, we excluded them from manual similarity identification.</p>
      </sec>
      <sec>
        <title>Phenotypically Similar GARD Disease Identification</title>
        <p>To determine the phenotypical similarity of GARD diseases, our subject matter experts (GA, KH, and ES) manually evaluated the prioritized disease pairs based on their similarity scores generated from the above step. The manual validation was not only attempting to examine the accuracy of computational results to establish business rules for further GARD data harmonization, but also to validate correctness and coverage of the GARD-curated external mappings.</p>
        <p>The manual review process consisted of 3 steps: (1) categorizing GARD disease pairs to phenotypical similarity types, namely “Duplicates,” “Subtypes,” “Siblings,” and “Unrelated;” (2) researching the latest epidemiology studies (eg, PubMed articles, trusted resources) for each disease if applicable, to re-evaluate the qualification of RARE disease based on the US definition of rare disease [<xref ref-type="bibr" rid="ref1">1</xref>]; (3) documenting the decision-making process for future reference. As an example demonstrating this review process, “Testicular Cancer (GARD:0007746)” and “Testicular germ cell tumor (GARD:0013047),” with a similarity score of 0.71, were initially grouped as subtypes. However, researching the latest epidemiological data for testicular cancer uncovered that “in 2017, there were an estimated 269,769 men living with testicular cancer in the United States” [<xref ref-type="bibr" rid="ref25">25</xref>]; this indicates that the prevalence rate of testicular cancer does not meet (ie, is higher than) the US definition of rare diseases, and so it was marked to “Retire.”</p>
        <p>In this context, we defined <italic>precision</italic> as the fraction between the number of correctly identified phenotypically similar disease pairs based on manual evaluation and the total number of similar disease pairs identified; we defined <italic>recall</italic> as the fraction between the number of correctly identified phenotypically similar disease pairs and the total number of similar disease pairs; we defined <italic>F measure</italic> as the balanced harmonic mean of the precision and recall. We computed precision, recall, and F measure to measure the performance of this approach.</p>
      </sec>
      <sec>
        <title>Phenotypical Similarity Derivation from Disease Classification Systems</title>
        <p>Diseases from the same disease category exhibit a high phenotypic homogeneity [<xref ref-type="bibr" rid="ref26">26</xref>]; we assume that phenotypical similarity is evidently presenting among sibling diseases, which share the same parent diseases in disease classification systems. To further prove our assumption by assessing 3 disease classification systems, including GARD, MONDO, and Orphanet, we developed a web application to search and review a specific disease term presenting in these 3 disease trees to perform a comparison. This web application is publicly accessible [<xref ref-type="bibr" rid="ref27">27</xref>]. <xref rid="figure2" ref-type="fig">Figure 2</xref> shows one screenshot of the search results for “Wilson disease.” MONDO and Orphanet have more refined and complete disease classifications than the GARD, which enables phenotypical similarity identification for GARD diseases.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Disease tree visualization via the GARD Data Tree web tool.</p>
          </caption>
          <graphic xlink:href="medinform_v8i10e18395_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Retrieving Phenotypically Similar GARD Diseases</title>
        <p>With the help of the GARD Data Tree web tool, we were able to form a process of deriving phenotypical similarity among GARD diseases in 3 steps: (1) mapping GARD diseases to MONDO and Orphanet; (2) extracting all sibling diseases of the mapped MONDO and Orphanet diseases from their disease trees; and (3) mapping the retrieved sibling diseases back to the GARD. The GARD diseases retrieved from the third step should be phenotypically similar to the query GARD disease from the first step. We further validated them by leveraging their associated phenotypes and genotypes.</p>
        <p>These 3 steps can be formalized in Cypher Queries accordingly; examples are shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>. After obtaining mappings between GARD and Orphanet/MONDO by executing Cypher Query 1 shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>, we searched parent diseases of those mapped MONDO and Orphanet diseases. Cypher Query 2 is an example of extracting Orphanet parent diseases for the Orphanet concept “Wilson Disease (ORPHA:905),” which is mapped to “GARD:0007893” from Cypher Query 1. Cypher Query 3 demonstrates a process that extracts all child diseases for 1 Orphanet parent disease, “SUPRANUCLEAR EYE MOVEMENT DISORDER (ORPHANET:98687),” which is 1 parent node returned from Cypher Query 2, and maps those child Orphanet diseases to GARD diseases. In order to identify the most phenotypically similar GARD diseases obtained from Cypher Query 3 to the inquiry disease “Wilson Disease (GARD:0007893),” we prioritized similarity based on their associated phenotypes and genes.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Cypher query examples for extracting phenotypically similar GARD diseases by navigating Orphanet disease classification systems.</p>
          </caption>
          <graphic xlink:href="medinform_v8i10e18395_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Prioritizing Phenotypically Similar GARD Diseases Based on Phenotypes and Genotypes</title>
        <p>Given the fact that a majority of rare diseases are genetic in origin and that clinical phenotypes are one of the red flags increasing rare disease attentiveness in clinical practice [<xref ref-type="bibr" rid="ref28">28</xref>], we developed a protocol for prioritizing phenotypical similarity based on phenotypes and genotypes. We collected phenotypes from the HPO and genes from OMIM from our knowledge graph, for those similar GARD disease pairs identified from the above step. The number of phenotypes and genes shared by each pair of phenotypically similar GARD diseases was applied for prioritization.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Results of Disease Mapping Analysis</title>
        <sec>
          <title>Disease Concept Retrieval</title>
          <p>We extracted disease mappings between GARD and Orphanet, OMIM, and the UMLS from our Neo4j knowledge graph. The retrieval results are shown in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>Results of disease mapping retrieval from Neo4j graph.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="500"/>
              <col width="500"/>
              <thead>
                <tr valign="top">
                  <td>Types of mapping</td>
                  <td>Number of mappings</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>GARD2Orphanet</td>
                  <td>2,869</td>
                </tr>
                <tr valign="top">
                  <td>GARD2OMIM</td>
                  <td>3,500</td>
                </tr>
                <tr valign="top">
                  <td>GARD2UMLS</td>
                  <td>3,584</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
        </sec>
        <sec>
          <title>Disease Similarity Calculation</title>
          <p>We enumerated disease pairs for 5236 GARD diseases with disease mappings and calculated cosine similarity for those GARD pairs. After excluding those disease pairs with similarity equaling 0, 392 disease pairs remained. <xref ref-type="table" rid="table3">Table 3</xref> summarizes the results of the similarity calculation.</p>
          <table-wrap position="float" id="table3">
            <label>Table 3</label>
            <caption>
              <p>Similarity calculation results for disease pairs (n=392).</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="500"/>
              <col width="500"/>
              <thead>
                <tr valign="top">
                  <td>Similarity scores</td>
                  <td>Number of disease pairs</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>1</td>
                  <td>34</td>
                </tr>
                <tr valign="top">
                  <td>0.5 &#60;= Similarity &#60; 1</td>
                  <td>264</td>
                </tr>
                <tr valign="top">
                  <td>0 &#60; Similarity &#60; 0.5</td>
                  <td>94</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
        </sec>
        <sec>
          <title>Evaluation and Disease Similarity Identification</title>
          <p>Our subject matter experts manually reviewed these 392 disease pairs and assigned their similarity types accordingly. <xref ref-type="table" rid="table4">Table 4</xref> shows their review results.</p>
          <p>Of the 392 disease pairs, 341 (87%) were identified and categorized as phenotypically similar, corresponding to the categories “Duplicated,” “Siblings,” and “Subtypes.” Of those 341 disease pairs, 271 disease pairs (80%) with similarity scores greater than 0.5 were verified as phenotypically similar. However, 34 disease pairs were determined to be “Unrelated,” and another 17 disease pairs were “Ungrouped;” this needs further discussion, and so we excluded the latter group for calculations of precision, recall, and F measure.</p>
          <table-wrap position="float" id="table4">
            <label>Table 4</label>
            <caption>
              <p>Manual review results for the disease pairs (n=392); precision=94%, recall=79%, F measure=86%.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="30"/>
              <col width="30"/>
              <col width="340"/>
              <col width="130"/>
              <col width="100"/>
              <col width="0"/>
              <col width="110"/>
              <col width="0"/>
              <col width="130"/>
              <col width="0"/>
              <col width="130"/>
              <thead>
                <tr valign="top">
                  <td colspan="3">
                    Variables
                  </td>
                  <td colspan="8">
                    Phenotypical similarity types
                  </td>
                </tr>
                <tr valign="top">
                  <td colspan="3">
                    <break/>
                  </td>
                  <td>
                    Duplicated
                  </td>
                  <td colspan="2">
                    Siblings
                  </td>
                  <td colspan="2">
                    Subtypes
                  </td>
                  <td colspan="2">
                    Unrelated
                  </td>
                  <td>
                    Ungrouped
                  </td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="11">
                    <bold>Number of disease pairs, n</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td colspan="2">
                    Phenotypically similar (n=341)
                  </td>
                  <td>
                    105
                  </td>
                  <td colspan="2">
                    117
                  </td>
                  <td colspan="2">
                    119
                  </td>
                  <td colspan="2">
                    N/A<sup>a</sup>
                  </td>
                  <td>
                    N/A
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td colspan="2">
                    Not phenotypically similar (n=51)
                  </td>
                  <td>
                    N/A
                  </td>
                  <td colspan="2">
                    N/A
                  </td>
                  <td colspan="2">
                    N/A
                  </td>
                  <td colspan="2">
                    34
                  </td>
                  <td>
                    17
                  </td>
                </tr>
                <tr valign="top">
                  <td colspan="11">
                    <bold>Similarity scores, n</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td colspan="10">
                    <bold>Phenotypically similar (n=341)</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    0.7≤Score≤1 (n=95)
                  </td>
                  <td>
                    47
                  </td>
                  <td colspan="2">
                    21
                  </td>
                  <td colspan="2">
                    27
                  </td>
                  <td colspan="2">
                    N/A
                  </td>
                  <td>
                    N/A
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    0.5≤Score&#60;0.7 (n=176)
                  </td>
                  <td>
                    42
                  </td>
                  <td colspan="2">
                    81
                  </td>
                  <td colspan="2">
                    53
                  </td>
                  <td colspan="2">
                    N/A
                  </td>
                  <td>
                    N/A
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    Score&#60;0.5 (n=70)
                  </td>
                  <td>
                    16
                  </td>
                  <td colspan="2">
                    15
                  </td>
                  <td colspan="2">
                    39
                  </td>
                  <td colspan="2">
                    N/A
                  </td>
                  <td>
                    N/A
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td colspan="10">
                    <bold>Not phenotypically similar (n=51)</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    0.7≤Score≤1 (n=16)
                  </td>
                  <td>
                    N/A
                  </td>
                  <td colspan="2">
                    N/A
                  </td>
                  <td colspan="2">
                    N/A
                  </td>
                  <td colspan="2">
                    8
                  </td>
                  <td>
                    8
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    0.5≤Score&#60;0.7 (n=15)
                  </td>
                  <td>
                    N/A
                  </td>
                  <td colspan="2">
                    N/A
                  </td>
                  <td colspan="2">
                    N/A
                  </td>
                  <td colspan="2">
                    8
                  </td>
                  <td>
                    7
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    Score&#60;0.5 (n=20)
                  </td>
                  <td>
                    N/A
                  </td>
                  <td colspan="2">
                    N/A
                  </td>
                  <td colspan="2">
                    N/A
                  </td>
                  <td colspan="2">
                    18
                  </td>
                  <td>
                    2
                  </td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table4fn1">
                <p><sup>a</sup>N/A: not applicable.
                </p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
      </sec>
      <sec>
        <title>Results of Phenotypical Similarity Derivation from Disease Classification Systems</title>
        <p>Based on the above analysis, 53 GARD diseases were marked for retirement. Of the remaining 5955 GARD diseases, 4798 GARD diseases obtained 1 or more phenotypically similar GARD disease(s) from this step. The stepwise results are shown in <xref rid="figure4" ref-type="fig">Figure 4</xref>.</p>
        <p>Of 5286 GARD diseases mapped to one of 21,823 MONDO diseases with parent diseases, 4549 GARD diseases obtained phenotypically similar GARD diseases via MONDO sibling disease mappings. Of 2631 GARD diseases mapped to one of 7024 Orphanet diseases with parent diseases, 2459 GARD diseases obtained phenotypically similar GARD diseases via Orphanet sibling disease mappings. By combining these 2 lists of mappings, 4798 GARD diseases obtained phenotypically similar diseases. We paired these 4798 GARD diseases with identified phenotypically similar diseases and ended with 241,604 unique GARD disease pairs.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Results of phenotypically similar GARD disease retrieval based on MONDO and Orphanet disease classifications.</p>
          </caption>
          <graphic xlink:href="medinform_v8i10e18395_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <sec>
          <title>Phenotypically Similar Disease Prioritization Based on Phenotypes and Genotypes</title>
          <p>Of the 241,604 disease pairs identified for these 4798 GARD diseases, 84,054 disease pairs shared at least 1 phenotype and 396 disease pairs shared at least 1 gene. By combining these 2 sets, there are 360 GARD disease pairs with at least 1 shared phenotype and gene. As all of those disease pairs were extracted from sibling diseases presenting in the MONDO and Orphanet, these 360 disease pairs were consequently grouped as “Siblings” with different degrees of phenotypical similarity based on the number of their shared phenotypes and genes.</p>
          <p>By combining 341 disease pairs identified from the step of disease mapping analysis, 662 disease pairs showed phenotypical similarity. It is worth noting that there are 39 overlaps between these 2 sets. Based on the manual evaluation shown in <xref ref-type="table" rid="table4">Table 4</xref>, these 39 pairs consist of 25 disease pairs that are sibling diseases, 7 disease pairs that are subtypes, 2 pairs that are duplicates, and 5 pairs that are unrelated diseases.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <p>In this study, we identified and prioritized phenotypical similarity among GARD diseases by comparing disease similarity and deriving phenotypical similarity from disease classification systems. As a proof-of-concept, we demonstrated the usefulness of the identified phenotypically similar disease pairs to support data harmonization for GARD. By incorporating these identified similar diseases, GARD will have the capability of supporting education and clinical decision making; for instance, GARD can provide more complementary information not only for the inquiry disease but also for phenotypically similar diseases.</p>
      <p>There are many different rare disease resources available, and each has its own strengths and focus. OMIM classifies diseases based on their genetic cause, Orphanet defines rare diseases based on phenotypical characteristics, and UMLS incorporates biomedical vocabulary and standards to define their disease concepts. Given the complementary definition of disease concepts from these 3 resources, we employed their mappings to the GARD diseases for disease similarity comparison. Of the 341 disease pairs identified as phenotypically similar, 271 disease pairs (80%) with similarity scores greater than 0.5 were successfully validated as clinically relevant by our genetic specialists. Besides these true positives, feedback from our subject matter experts on the false positives [ie, 16 disease pairs (~4%) with similarity scores greater than 0.5 were manually determined as irrelevant] and false negatives [ie, 70 disease pairs (~18%) with similarity scores less than 0.5 were manually determined as relevant] illustrates that it is important to accurately capture the latest information in regard to disease mappings across different resources, and to incorporate human interpretations. For example, “Spondylothoracic dysostosis (GARD:0006798)” and “Spondylocostal dysostosis 1 (GARD:0010726)” share 3 of the same mappings, “ORPHA:2311,” “UMLS:C0265343,” and “OMIM:277300,” so their similarity score equals 1.0, indicating that they should be highly similar. However, our experts marked them as “Unrelated” due to the fact that these 2 conditions were grouped together in the past (both were previously referred to as Jarcho-Levin syndrome); they are considered as distinct conditions now, according to references from Genetics Home Reference (GHR) [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. Berdon et al [<xref ref-type="bibr" rid="ref31">31</xref>] also discussed the clinical and radiological distinction between these 2 diseases. 
Another example is “Hunter Carpenter Macdonald syndrome (GARD:0002751)” and “Infantile neuroaxonal dystrophy (GARD:0003957),” which have a similarity score of 0.35, indicating that they should be less relevant. However, they were marked as relevant by our experts given that PLA2G6-associated neurodegeneration (PLAN) comprises a continuum of 3 phenotypes with overlapping clinical and radiologic features for these 2 diseases, and similar evidence can be found at Orphanet [<xref ref-type="bibr" rid="ref32">32</xref>] that reveals that Hunter-Carpenter-McDonald syndrome has been moved to “Infantile neuroaxonal dystrophy.” Out of the total of 13,705,230 GARD disease pairs, there are only 392 disease pairs with similarity scores greater than 0, which might direct the extension in 2 ways. First, 3 selected resources might not be comprehensive enough to cover all GARD diseases for disease similarity comparison based on their disease mappings. Therefore, we plan to extend our work with additional rare disease resources, such as MONDO, Disease Ontology, NCI Thesaurus, etc. Second, external disease mappings curated by GARD are accurate but might be incomplete due to cumbersome human effort. Thus, we will extend the disease mappings by inferring new associations via network analysis from the Neo4j knowledge graph.</p>
      <p>Phenotypical similarity derivation from disease classifications resulted in 360 disease pairs that share at least 1 phenotype and gene, and they are grouped as sibling diseases. Among 241,604 disease pairs retrieved from the disease classification trees, there are 84,054 disease pairs that share at least 1 phenotype and 396 disease pairs that share at least 1 gene. Compared to the number of disease pairs with shared phenotypes, a relatively small number of disease pairs shared at least 1 gene; we are planning to obtain more genes for GARD diseases from other resources, including DisGeNet [<xref ref-type="bibr" rid="ref33">33</xref>] and ClinVar [<xref ref-type="bibr" rid="ref34">34</xref>]. Given the success we gained from this study in identifying phenotypical similarity derived from sibling diseases from disease classifications, we propose to extend this work with subtype diseases (ie, parent diseases and child diseases) by mining disease classifications. Once we have GARD diseases that we are able to assign to those relevant categories, we will develop our own disease classification system, which will not only define more accurate disease definitions and relationships among those diseases but will also serve as a unique, rare disease resource in the United States.</p>
      <p>By combining 2 sets generated by our 2 approaches, we identified 662 phenotypically similar disease pairs and mapped them to 4 phenotypical similarity types, namely, “Duplicated,” “Subtypes,” “Siblings,” and “Unrelated,” which will be applied to direct GARD data harmonization. To be specific, for “Duplicated” disease pairs, we will select and keep primary diseases in the GARD database; “Siblings” and “Subtypes” will direct GARD disease classification regeneration; for “Unrelated” diseases, we will keep these 2 diseases separate in the GARD database.</p>
      <p>By comparing these 2 sets, there are 39 overlapped disease pairs. These 39 disease pairs were grouped as “Siblings” by the second approach of disease classification derivation. However, based on the evaluation result (<xref ref-type="table" rid="table4">Table 4</xref>) from the first approach of disease mapping analysis, of these 39 disease pairs, 25 disease pairs were grouped as “Siblings,” 7 pairs were grouped as “Subtypes,” 2 pairs were grouped as “Duplicated,” and 5 pairs were grouped as “Unrelated.” For instance, “Malignant hyperthermia” and “King Denborough syndrome” are classified as sibling diseases by the second approach, since they are siblings in Orphanet, which groups them under the same disease parent class of “Rare Disease With Malignant Hyperthermia (ORPHA:466658).” However, they are determined as different diseases by our subject-matter experts, and the same statement has been made in the GARD page for “King-Denborough syndrome (GARD:0008433),” claiming that “King-Denborough syndrome is a congenital myopathy associated with susceptibility to malignant hyperthermia (GARD:0006964)” [<xref ref-type="bibr" rid="ref35">35</xref>]. Such discrepancies occurring across different resources unveiled from this study illustrate that there is an urgent need to propose a standard protocol for guiding data harmonization in the rare disease field globally. Regardless of phenotypical similarity types, the process our subject-matter experts took in the evaluation step is crucial to re-evaluate rare diseases with the latest prevalence data, which is one critical step to determine their eligibility of RARE. 
For instance, there are more than 200,000 individuals in the United States who are affected with familial Alzheimer disease (GARD:0000632) [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]; thus, the prevalence rate of this disease does not meet the criteria of the United States’ rare disease definition, so it will be retired from the GARD database.</p>
      <sec>
        <title>Conclusion</title>
        <p>In this paper, we report our recent effort at identifying phenotypical similarity among rare diseases by leveraging disease mappings among various resources and disease classifications. This effort will not only direct further GARD data harmonization but will also highlight the value of cross-resource collaboration. We propose to extend this work with more rare disease resources at the NIH or outside the NIH for the improved assembly of information for rare diseases in order to better disseminate information to patients and health care providers.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">GARD</term>
          <def>
            <p>genetic and rare diseases</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">HPO</term>
          <def>
            <p>Human Phenotype Ontology</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">MONDO</term>
          <def>
            <p>Monarch Disease Ontology</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">OMIM</term>
          <def>
            <p>Online Mendelian Inheritance in Man</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">UMLS</term>
          <def>
            <p>Unified Medical Language System</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This research was supported by the Intramural research program of the NCATS, NIH.</p>
      <p>The authors thank Tongan Zhao from the Division of Pre-Clinical Innovation (DPI) at NCATS, who developed the GARD Data Tree web tool; Michelle Snyder, an operational program manager of the GARD from ICF International Inc, who supported the manual review and provided valuable discussion; and Jim Dickens, a program manager from the Office of Rare Diseases Research (ORDR) at NCATS, who helped to arrange discussion meetings and participated in valuable discussions.</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>The work was conceived by QZ, who also designed and performed the experiments and wrote all the source code and the manuscript. TN supported data extraction from the Neo4j database and participated in the project discussion. GA and KH, as the subject matter experts, manually reviewed and evaluated the results and provided valuable insights. ES participated in the project discussion and helped with the manual review. AP, as the Director of the Office of Rare Diseases Research (ORDR) at NCATS, supported this work and participated in valuable discussions. All authors read, edited, and approved the final manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Boat</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Field</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <source>Rare diseases and orphan products: Accelerating research and development</source>
          <year>2011</year>
          <publisher-loc>Washington, DC</publisher-loc>
          <publisher-name>National Academies Press</publisher-name>
          <fpage>A</fpage>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Groft</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Posada de la Paz</surname>
              <given-names>Manuel</given-names>
            </name>
          </person-group>
          <article-title>Rare Diseases: Joining Mainstream Research and Treatment Based on Reliable Epidemiological Data</article-title>
          <source>Adv Exp Med Biol</source>
          <year>2017</year>
          <volume>1031</volume>
          <fpage>3</fpage>
          <lpage>21</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-319-67144-4_1</pub-id>
          <pub-id pub-id-type="medline">29214563</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
          <source>RARE AND ULTRA-RARE DISEASES</source>
          <access-date>2020-09-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ir.alexion.com/static-files/e07be2fa-fb02-43d7-ad00-844e3c66e86f#:~:text=Rare%20and%20ultra%2Drare%20diseases%2C%20often%20referred%20to%20as%20orphan,patients%20per%20million%20of%20population)">https://bit.ly/2SbQifn</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="web">
          <source>The Global Challenge of Rare Disease Diagnosis</source>
          <access-date>2020-09-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://sc8-cms-shire-com.shirecontent.com/-/media/shire/shireglobal/shirecom/pdffiles/patient/shire-diagnosis-initiative-hcp-leaflet.pdf">https://sc8-cms-shire-com.shirecontent.com/-/media/shire/shireglobal/shirecom/pdffiles/patient/shire-diagnosis-initiative-hcp-leaflet.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <source>Genetic and Rare Disease Information Center</source>
          <access-date>2020-09-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://rarediseases.info.nih.gov/">https://rarediseases.info.nih.gov/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weinreich</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mangon</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sikkens</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Teeuw</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cornel</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>[Orphanet: a European database for rare diseases]</article-title>
          <source>Ned Tijdschr Geneeskd</source>
          <year>2008</year>
          <month>03</month>
          <day>01</day>
          <volume>152</volume>
          <issue>9</issue>
          <fpage>518</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="medline">18389888</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="web">
          <source>Mondo Disease Ontology</source>
          <access-date>2020-09-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://obofoundry.org/ontology/mondo.html">http://obofoundry.org/ontology/mondo.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Haendel</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Vasilevsky</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Unni</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bologa</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Rehm</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Hamosh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Baynam</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Groza</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>McMurry</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dawkins</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Rath</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Thaxon</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bocci</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Joachimiak</surname>
              <given-names>MP</given-names>
            </name>
            <name name-style="western">
              <surname>Köhler</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Robinson</surname>
              <given-names>PN</given-names>
            </name>
            <name name-style="western">
              <surname>Mungall</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Oprea</surname>
              <given-names>TI</given-names>
            </name>
          </person-group>
          <article-title>How many rare diseases are there?</article-title>
          <source>Nat Rev Drug Discov</source>
          <year>2019</year>
          <month>11</month>
          <day>5</day>
          <volume>19</volume>
          <issue>2</issue>
          <fpage>77</fpage>
          <lpage>78</lpage>
          <pub-id pub-id-type="doi">10.1038/d41573-019-00180-y</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pontikos</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Moghul</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Withington</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Blanco-Kelly</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Vulliamy</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Phenopolis: an open platform for harmonization and analysis of genetic and phenotypic data</article-title>
          <source>Bioinformatics</source>
          <year>2017</year>
          <volume>33</volume>
          <issue>15</issue>
          <fpage>2421</fpage>
          <lpage>2423</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btx147</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Siu</surname>
              <given-names>LL</given-names>
            </name>
            <name name-style="western">
              <surname>Lawler</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Haussler</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Knoppers</surname>
              <given-names>BM</given-names>
            </name>
            <name name-style="western">
              <surname>Lewin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Vis</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Liao</surname>
              <given-names>RG</given-names>
            </name>
            <name name-style="western">
              <surname>Andre</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Banks</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Barrett</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Caldas</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Camargo</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Fitzgerald</surname>
              <given-names>RC</given-names>
            </name>
            <name name-style="western">
              <surname>Mao</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mattison</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Pao</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Sellers</surname>
              <given-names>WR</given-names>
            </name>
            <name name-style="western">
              <surname>Sullivan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Teh</surname>
              <given-names>BT</given-names>
            </name>
            <name name-style="western">
              <surname>Ward</surname>
              <given-names>RL</given-names>
            </name>
            <name name-style="western">
              <surname>ZenKlusen</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Sawyers</surname>
              <given-names>CL</given-names>
            </name>
            <name name-style="western">
              <surname>Voest</surname>
              <given-names>EE</given-names>
            </name>
          </person-group>
          <article-title>Facilitating a culture of responsible and effective sharing of cancer genome data</article-title>
          <source>Nat Med</source>
          <year>2016</year>
          <month>05</month>
          <day>05</day>
          <volume>22</volume>
          <issue>5</issue>
          <fpage>464</fpage>
          <lpage>71</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27149219"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/nm.4089</pub-id>
          <pub-id pub-id-type="medline">27149219</pub-id>
          <pub-id pub-id-type="pii">nm.4089</pub-id>
          <pub-id pub-id-type="pmcid">PMC4995884</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Boutros</surname>
              <given-names>PC</given-names>
            </name>
            <name name-style="western">
              <surname>Ewing</surname>
              <given-names>AD</given-names>
            </name>
            <name name-style="western">
              <surname>Ellrott</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Norman</surname>
              <given-names>TC</given-names>
            </name>
            <name name-style="western">
              <surname>Dang</surname>
              <given-names>KK</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kellen</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Suver</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bare</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Stein</surname>
              <given-names>LD</given-names>
            </name>
            <name name-style="western">
              <surname>Spellman</surname>
              <given-names>PT</given-names>
            </name>
            <name name-style="western">
              <surname>Stolovitzky</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Friend</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Margolin</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Stuart</surname>
              <given-names>JM</given-names>
            </name>
          </person-group>
          <article-title>Global optimization of somatic variant identification in cancer genomes with a global community challenge</article-title>
          <source>Nat Genet</source>
          <year>2014</year>
          <month>3</month>
          <day>27</day>
          <volume>46</volume>
          <issue>4</issue>
          <fpage>318</fpage>
          <lpage>319</lpage>
          <pub-id pub-id-type="doi">10.1038/ng.2932</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="web">
          <source>About Orphanet</source>
          <access-date>2020-09-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.orpha.net/consor/cgi-bin/Education_AboutOrphanet.php?lng=EN">https://www.orpha.net/consor/cgi-bin/Education_AboutOrphanet.php?lng=EN</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="web">
          <source>OMIM Frequently Asked Questions (FAQs)</source>
          <access-date>2020-09-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.omim.org/help/faq">https://www.omim.org/help/faq</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reumann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Giovannini</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Nadworny</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Auer</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Girardi</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Marchiori</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Cognitive DDx Assistant in Rare Diseases</article-title>
          <source>Annu Int Conf IEEE Eng Med Biol Soc</source>
          <year>2018</year>
          <month>07</month>
          <conf-name>2018 40th Annual International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC)</conf-name>
          <conf-date>18-21 July 2018</conf-date>
          <conf-loc>Honolulu, HI, USA</conf-loc>
          <fpage>3244</fpage>
          <lpage>3247</lpage>
          <pub-id pub-id-type="doi">10.1109/EMBC.2018.8513041</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mei</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>Improving rare disease classification using imperfect knowledge graph</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2019</year>
          <month>12</month>
          <day>5</day>
          <volume>19</volume>
          <issue>S5</issue>
          <pub-id pub-id-type="doi">10.1186/s12911-019-0938-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sosa</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Derry</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Brinton</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Altman</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>A Literature-Based Knowledge Graph Embedding Method for Identifying Drug Repurposing Opportunities in Rare Diseases</article-title>
          <source>bioRxiv</source>
          <year>2019</year>
          <fpage>727925</fpage>
          <pub-id pub-id-type="doi">10.1142/9789811215636_0041</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="web">
          <source>Disease oriented integrative knowledge graph</source>
          <access-date>2020-09-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://disease.ncats.io/browser/">https://disease.ncats.io/browser/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="web">
          <source>Orphanet rare disease classification</source>
          <access-date>2020-09-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.orpha.net/consor/cgi-bin/Disease_Classif.php?lng=EN">https://www.orpha.net/consor/cgi-bin/Disease_Classif.php?lng=EN</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <source>Procedural document: Orphanet nomenclature and classification of rare diseases</source>
          <access-date>2020-09-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.orpha.net/orphacom/cahiers/docs/GB/eproc_disease_inventory_R1_Nom_Dis_EP_04.pdf">https://www.orpha.net/orphacom/cahiers/docs/GB/eproc_disease_inventory_R1_Nom_Dis_EP_04.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hamosh</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Online Mendelian Inheritance in Man (OMIM), a knowledgebase of human genes and genetic disorders</article-title>
          <source>Nucleic Acids Res</source>
          <year>2004</year>
          <month>12</month>
          <day>17</day>
          <volume>33</volume>
          <issue>Database issue</issue>
          <fpage>D514</fpage>
          <lpage>D517</lpage>
          <pub-id pub-id-type="doi">10.1093/nar/gki033</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Robinson</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Mundlos</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>The human phenotype ontology</article-title>
          <source>Clin Genet</source>
          <year>2010</year>
          <volume>77</volume>
          <issue>6</issue>
          <fpage>525</fpage>
          <lpage>34</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1399-0004.2010.01436.x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bodenreider</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>The Unified Medical Language System (UMLS): integrating biomedical terminology</article-title>
          <source>Nucleic Acids Res</source>
          <year>2004</year>
          <month>01</month>
          <day>01</day>
          <volume>32</volume>
          <issue>90001</issue>
          <fpage>D267</fpage>
          <lpage>D270</lpage>
          <pub-id pub-id-type="doi">10.1093/nar/gkh061</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="web">
          <source>Cypher Query Language</source>
          <access-date>2020-09-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://neo4j.com/developer/cypher-query-language/">https://neo4j.com/developer/cypher-query-language/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="web">
          <source>Cosine Similarity</source>
          <access-date>2020-09-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://en.wikipedia.org/wiki/Cosine_similarity">https://en.wikipedia.org/wiki/Cosine_similarity</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="web">
          <source>Cancer Stat Facts: Testicular Cancer From NCI SEER</source>
          <access-date>2020-09-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://seer.cancer.gov/statfacts/html/testis.html">https://seer.cancer.gov/statfacts/html/testis.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hoehndorf</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Schofield</surname>
              <given-names>PN</given-names>
            </name>
            <name name-style="western">
              <surname>Gkoutos</surname>
              <given-names>GV</given-names>
            </name>
          </person-group>
          <article-title>Analysis of the human diseasome using phenotype similarity between common, genetic, and infectious diseases</article-title>
          <source>Sci Rep</source>
          <year>2015</year>
          <month>06</month>
          <day>08</day>
          <volume>5</volume>
          <fpage>10888</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.doi.org/10.1038/srep10888"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/srep10888</pub-id>
          <pub-id pub-id-type="medline">26051359</pub-id>
          <pub-id pub-id-type="pii">srep10888</pub-id>
          <pub-id pub-id-type="pmcid">PMC4458913</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="web">
          <source>GARD Data Tree</source>
          <access-date>2020-09-23</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://tripod.nih.gov/gardtree/">https://tripod.nih.gov/gardtree/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="web">
          <source>FAQs About Rare Diseases</source>
          <access-date>2020-09-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://rarediseases.info.nih.gov/diseases/pages/31/faqs-about-rare-diseases">https://rarediseases.info.nih.gov/diseases/pages/31/faqs-about-rare-diseases</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="web">
          <source>Spondylocostal dysostosis</source>
          <access-date>2020-09-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ghr.nlm.nih.gov/condition/spondylocostal-dysostosis">https://ghr.nlm.nih.gov/condition/spondylocostal-dysostosis</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="web">
          <source>Spondylothoracic dysostosis</source>
          <access-date>2020-09-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ghr.nlm.nih.gov/condition/spondylothoracic-dysostosis">https://ghr.nlm.nih.gov/condition/spondylothoracic-dysostosis</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Berdon</surname>
              <given-names>WE</given-names>
            </name>
            <name name-style="western">
              <surname>Lampl</surname>
              <given-names>BS</given-names>
            </name>
            <name name-style="western">
              <surname>Cornier</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Ramirez</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Turnpenny</surname>
              <given-names>PD</given-names>
            </name>
            <name name-style="western">
              <surname>Vitale</surname>
              <given-names>MG</given-names>
            </name>
            <name name-style="western">
              <surname>Seimon</surname>
              <given-names>LP</given-names>
            </name>
            <name name-style="western">
              <surname>Cowles</surname>
              <given-names>RA</given-names>
            </name>
          </person-group>
          <article-title>Clinical and radiological distinction between spondylothoracic dysostosis (Lavy-Moseley syndrome) and spondylocostal dysostosis (Jarcho-Levin syndrome)</article-title>
          <source>Pediatr Radiol</source>
          <year>2010</year>
          <month>12</month>
          <day>22</day>
          <volume>41</volume>
          <issue>3</issue>
          <fpage>384</fpage>
          <lpage>388</lpage>
          <pub-id pub-id-type="doi">10.1007/s00247-010-1928-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="web">
          <source>Hunter-Carpenter-McDonald syndrome</source>
          <access-date>2020-09-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bit.ly/3cLGC51">https://bit.ly/3cLGC51</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Piñero</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bravo</surname>
              <given-names>À</given-names>
            </name>
            <name name-style="western">
              <surname>Queralt-Rosinach</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Gutiérrez-Sacristán</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Deu-Pons</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Centeno</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>García-García</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sanz</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Furlong</surname>
              <given-names>LI</given-names>
            </name>
          </person-group>
          <article-title>DisGeNET: a comprehensive platform integrating information on human disease-associated genes and variants</article-title>
          <source>Nucleic Acids Res</source>
          <year>2016</year>
          <month>10</month>
          <day>19</day>
          <volume>45</volume>
          <issue>D1</issue>
          <fpage>D833</fpage>
          <lpage>D839</lpage>
          <pub-id pub-id-type="doi">10.1093/nar/gkw943</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Landrum</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Benson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Chao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chitipiralla</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Hart</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hoffman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hoover</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Katz</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ovetsky</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Riley</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Sethi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tully</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Villamarin-Salomon</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rubinstein</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Maglott</surname>
              <given-names>DR</given-names>
            </name>
          </person-group>
          <article-title>ClinVar: public archive of interpretations of clinically relevant variants</article-title>
          <source>Nucleic Acids Res</source>
          <year>2015</year>
          <month>11</month>
          <day>17</day>
          <volume>44</volume>
          <issue>D1</issue>
          <fpage>D862</fpage>
          <lpage>D868</lpage>
          <pub-id pub-id-type="doi">10.1093/nar/gkv1222</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="web">
          <source>King Denborough syndrome</source>
          <access-date>2020-09-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://rarediseases.info.nih.gov/diseases/8433/king-denborough-syndrome/cases/22394#:~:text=The%20King%2DDenborough%20syndrome%20(KDS,features%20with%20characteristic%20facial%20appearance">https://bit.ly/36iRohM</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mayeux</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Stern</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Epidemiology of Alzheimer Disease</article-title>
          <source>Cold Spring Harb Perspect Med</source>
          <year>2012</year>
          <month>04</month>
          <day>10</day>
          <volume>2</volume>
          <issue>8</issue>
          <fpage>a006239</fpage>
          <lpage>a006239</lpage>
          <pub-id pub-id-type="doi">10.1101/cshperspect.a006239</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bird</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <source>Alzheimer disease overview</source>
          <year>2018</year>
          <publisher-loc>GeneReviews® [Internet]</publisher-loc>
          <publisher-name>University of Washington, Seattle</publisher-name>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
