<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v10i12e42379</article-id>
      <article-id pub-id-type="pmid">36534446</article-id>
      <article-id pub-id-type="doi">10.2196/42379</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Construction of Cohorts of Similar Patients From Automatic Extraction of Medical Concepts: Phenotype Extraction Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Xiong</surname>
            <given-names>Ying</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Candeliere</surname>
            <given-names>Jasmine</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Gaudet-Blavignac</surname>
            <given-names>Christophe</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Gérardin</surname>
            <given-names>Christel</given-names>
          </name>
          <degrees>MA, MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Institute Pierre Louis Epidemiology and Public Health</institution>
            <institution>Institut National de la Santé et de la Recherche Médicale, Sorbonne Université</institution>
            <addr-line>27 rue de Chaligny</addr-line>
            <addr-line>Paris, 75012</addr-line>
            <country>France</country>
            <phone>33 678148466</phone>
            <email>christel.ducroz-gerardin@iplesp.upmc.fr</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9303-6349</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Mageau</surname>
            <given-names>Arthur</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2995-767X</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Mékinian</surname>
            <given-names>Arsène</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2849-3049</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Tannier</surname>
            <given-names>Xavier</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2452-8868</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Carrat</surname>
            <given-names>Fabrice</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8672-7918</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Institute Pierre Louis Epidemiology and Public Health</institution>
        <institution>Institut National de la Santé et de la Recherche Médicale, Sorbonne Université</institution>
        <addr-line>Paris</addr-line>
        <country>France</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Institut National de la Santé et de la Recherche Médicale</institution>
        <institution>Unité Mixte de Recherche 1137 Infection Antimicrobials Modelling Evolution, Team Decision Sciences in Infectious Diseases</institution>
        <institution>Université Paris Cité</institution>
        <addr-line>Paris</addr-line>
        <country>France</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Service de Médecine Interne, Inflammation-Immunopathology-Biotherapy Department</institution>
        <institution>Hôpital Saint-Antoine, Sorbonne Université</institution>
        <institution>Assistance Publique–Hôpitaux de Paris</institution>
        <addr-line>Paris</addr-line>
        <country>France</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Laboratoire d'Informatique Médicale et d'Ingénierie des Connaissances pour la e-Santé</institution>
        <institution>Institut National de la Santé et de la Recherche Médicale, Université Sorbonne</institution>
        <addr-line>Paris</addr-line>
        <country>France</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Public Health Department</institution>
        <institution>Hôpital Saint-Antoine</institution>
        <institution>Assistance Publique–Hôpitaux de Paris</institution>
        <addr-line>Paris</addr-line>
        <country>France</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Christel Gérardin <email>christel.ducroz-gerardin@iplesp.upmc.fr</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>12</month>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>19</day>
        <month>12</month>
        <year>2022</year>
      </pub-date>
      <volume>10</volume>
      <issue>12</issue>
      <elocation-id>e42379</elocation-id>
      <history>
        <date date-type="received">
          <day>1</day>
          <month>9</month>
          <year>2022</year>
        </date>
        <date date-type="rev-request">
          <day>9</day>
          <month>10</month>
          <year>2022</year>
        </date>
        <date date-type="rev-recd">
          <day>17</day>
          <month>10</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>22</day>
          <month>10</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Christel Gérardin, Arthur Mageau, Arsène Mékinian, Xavier Tannier, Fabrice Carrat. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 19.12.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2022/12/e42379" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Reliable and interpretable automatic extraction of clinical phenotypes from large electronic medical record databases remains a challenge, especially in a language other than English.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>We aimed to provide an automated end-to-end extraction of cohorts of similar patients from electronic health records for systemic diseases.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Our multistep algorithm includes a named-entity recognition step, a multilabel classification using medical subject headings ontology, and the computation of patient similarity. A selection of cohorts of similar patients on a priori annotated phenotypes was performed. Six phenotypes were selected for their clinical significance: P1, osteoporosis; P2, nephritis in systemic lupus erythematosus; P3, interstitial lung disease in systemic sclerosis; P4, lung infection; P5, obstetric antiphospholipid syndrome; and P6, Takayasu arteritis. We used a training set of 151 clinical notes and an independent validation set of 256 clinical notes, with annotated phenotypes, both extracted from the Assistance Publique-Hôpitaux de Paris data warehouse. We evaluated the precision of the 3 patients closest to the index patient for each phenotype with precision-at-3, recall, and average precision.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>For P1-P4, the precision-at-3 ranged from 0.85 (95% CI 0.75-0.95) to 0.99 (95% CI 0.98-1), the recall ranged from 0.53 (95% CI 0.50-0.55) to 0.83 (95% CI 0.81-0.84), and the average precision ranged from 0.58 (95% CI 0.54-0.62) to 0.88 (95% CI 0.85-0.90). P5-P6 phenotypes could not be analyzed due to the limited number of phenotypes.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Using a method close to clinical reasoning, we built a scalable and interpretable end-to-end algorithm for extracting cohorts of similar patients.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>similar patient cohort</kwd>
        <kwd>phenotype</kwd>
        <kwd>systemic disease</kwd>
        <kwd>NLP</kwd>
        <kwd>algorithm</kwd>
        <kwd>automatic extraction</kwd>
        <kwd>automated extraction</kwd>
        <kwd>named entity</kwd>
        <kwd>MeSH</kwd>
        <kwd>medical subject heading</kwd>
        <kwd>data extraction</kwd>
        <kwd>text extraction</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Extracting clinical phenotypes from large electronic health record (EHR) databases, also known as clinical data warehouses, is a key step for several medical applications from epidemiological research [<xref ref-type="bibr" rid="ref1">1</xref>] to prognosis prediction [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>] and therapeutic decision support [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. Reliable automatic extraction of patient phenotypes from large EHR databases remains a challenge, especially in languages other than English [<xref ref-type="bibr" rid="ref6">6</xref>]. The actual identification of patients’ phenotypes is still largely done via the International Classification of Diseases, Ninth/Tenth Revision (ICD-9/ICD-10) code extraction, reading of clinical notes, or extraction of entities via regular expressions. However, as shown by Farzandipour et al [<xref ref-type="bibr" rid="ref7">7</xref>] on more than 300 EHR ICD-10 codes, 22.7% presented errors in principal diagnosis codes, of which 33.3% were major errors. Benkhaial et al [<xref ref-type="bibr" rid="ref8">8</xref>] also showed that, in a study of 200 patients, ICD allergy codes were present for 18 patients, while 51 had allergy information in a written note, indicating that only 35% of the allergies were correctly coded. These identification methods thus lack precision and require important human control.</p>
        <p>With the improvement of natural language processing over the last 10 years, new language models such as Word2vec [<xref ref-type="bibr" rid="ref9">9</xref>], GloVe [<xref ref-type="bibr" rid="ref10">10</xref>], FastText [<xref ref-type="bibr" rid="ref11">11</xref>] and, more recently, Bidirectional Encoder Representations from Transformers (BERT) [<xref ref-type="bibr" rid="ref12">12</xref>] have allowed significant progress for various natural language processing tasks such as translation, question-answering, and named-entity recognition via an efficient word representation. Named-entity recognition corresponds to the extraction of certain classes of entities in a raw text. In the medical domain, it can be “signs and symptoms,” “disorders,” “chemicals and drugs,” etc.</p>
        <p>Many research teams have developed new algorithms based on these word models to allow automatic patient phenotyping. De Freitas et al [<xref ref-type="bibr" rid="ref13">13</xref>] proposed Phe2vec, a data-driven, unsupervised disease phenotyping algorithm. In their study, disease phenotypes correspond to the word representation of ICD-10 core concepts (or seed concepts) and their closest neighbors. A patient’s clinical history is summarized by aggregating all the word vector representations of the medical concepts. Mapping a patient to a disease is then done by computing a cosine distance between the patient with each disease phenotype. In their method, the medical concept extraction step from clinical notes is performed based on 1 ontology [<xref ref-type="bibr" rid="ref14">14</xref>]. Ferté et al [<xref ref-type="bibr" rid="ref15">15</xref>] also proposed an algorithm for automatic phenotyping of EHRs by using ICD-10 codes and a dictionary-based entity recognition tool to extract interesting terms from clinical notes. Extracted terms were then mapped to their unified medical language system concept unique identifier as a feature for classification to provide an interpretable parametric predictor. Their work showed particularly interesting results for chronic conditions.</p>
        <p>In this work, we extracted similar patients by focusing on 4 systemic diseases as a proof of concept: systemic lupus erythematosus (SLE), systemic sclerosis, antiphospholipid syndrome (APS), and Takayasu arteritis. SLE is an autoimmune disease that can affect a large number of organs: the skin (specific malar rash, photosensitivity, etc), kidneys (nephrotic syndrome and glomerular nephropathy), joints (most often without deformation), brain (with neuropsychiatric forms), etc. It is a rare disease that affects 41 in 100,000 people in France [<xref ref-type="bibr" rid="ref16">16</xref>], and 9 women for 1 man in generally young (18-30 years old) adults. Systemic sclerosis can also involve various organs: the skin (sclerosis leading to significant functional impotence), the lungs (interstitial lung disease [ILD], fibrosis, and hypertension), the digestive system (reflux and chronic intestinal obstruction), etc. Its frequency is 1/5000 in France, and it preferentially affects women (4 women for 1 man) aged between 40 and 50 years. APS is a disease that causes venous and arterial thrombosis as well as obstetrical complications. Approximately 20%-30% of patients with lupus develop APS. Its frequency is approximately 1 in 12,000 [<xref ref-type="bibr" rid="ref16">16</xref>]. Takayasu arteritis is an inflammatory disease that affects large vessels in young people. It is a very rare disease affecting 1.2 to 2.6 cases/million/year in France. It affects 4.8 women for 1 man between 20 and 40 years of age [<xref ref-type="bibr" rid="ref17">17</xref>]. These 4 diseases were chosen because of their large spectrum of signs and symptoms and their similarity (especially for lupus and APS in terms of apparition frequency and APS and Takayasu for their arterial manifestations).</p>
      </sec>
      <sec>
        <title>Goal of This Study</title>
        <p>In this study, we aimed to develop an automated end-to-end extraction of similar patient cohorts from electronic medical records. Specifically, we place ourselves in the following use case: we have a patient to treat with clinical information in a text document (mentioned as index patient in this paper), and we automatically search for the set of patients with similar symptoms and diseases mentioned in their hospitalization reports. To evaluate our method, we extracted cohorts of similar patients from index patients with certain phenotypes described in their textual reports, arbitrarily selected, and manually annotated by a clinician. Our main contribution in this paper is the development of an algorithm for the automatic construction of similar patient cohorts by a method close to clinical reasoning, as we argue in the Discussion section.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Algorithm Steps</title>
        <p>In this section, we detail the main steps of our algorithm. Similarity is defined here as a patient with identical or closely related signs, symptoms, and disorders. The key steps for extracting these events from the text are a named-entity recognition step to extract medical concepts, a multilabel classification on each extracted term, and an average distance computation on an appropriate representation of all the terms on each label. We validated our interpatient distance by clustering 6 a priori defined phenotypes of interest: osteoporosis, nephritis in SLE, ILD in systemic sclerosis, lung infection, obstetric APS, and Takayasu arteritis. With the same interpatient distance, we then constructed similarity cohorts from index patients for each of these phenotypes.</p>
      </sec>
      <sec>
        <title>Overview of the Algorithm</title>
        <p>For readability, in the remainder of this paper, we use the term “patient” to refer to the “hospitalization report related to the patient.”</p>
        <p>The main steps of the algorithms are shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>, considering an index patient:</p>
        <list list-type="order">
          <list-item>
            <p>Symptoms and diseases were extracted from a raw text while filtering out all terms that were negated, hypothetical, or belonging to a family member.</p>
          </list-item>
          <list-item>
            <p>All extracted terms were classified into broad organ categories, that is, cardiovascular, immune, ophthalmologic, digestive, etc, by a multilabel classification step using our previously developed algorithm [<xref ref-type="bibr" rid="ref18">18</xref>].</p>
          </list-item>
          <list-item>
            <p>A vector (embedding) representation for all extracted terms was obtained leading to the index patient representation.</p>
          </list-item>
          <list-item>
            <p>Using the same representation for the other patients, the distance between the index patient and each of the other patients was computed for each label. Then, the average of the distances over all the labels was determined.</p>
          </list-item>
          <list-item>
            <p>A cohort of similar patients was built from the patients closest to the index patient for each annotated phenotype.</p>
          </list-item>
        </list>
        <p>We will refer to this patient’s hospitalization report (<xref rid="figure1" ref-type="fig">Figure 1</xref>, index_patient) as a running example throughout the steps described below.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Overview of the algorithm to obtain a representation of the patients’ electronic health records and to compute a distance from other patients’ electronic health records. First, a named-entity recognition step is performed on a patient's electronic health record (to extract symptoms and disorders and filter all negated, hypothetical, and someone else’s terms). Second, a multilabel classification step is performed for each extracted term to allow more clinical interpretation. Third, this leads to an electronic health record model containing all the extracted terms with their respective labels and embedding representations (last column of the model). Fourth, this allows a distance computation on each of the 22 labels (Dnervous corresponds to the distance between embeddings of all terms labelled nervous, Dimmune on the immune label, etc). Fifth, a similarity cohort computation is performed. EHR: Electronic Health Record.</p>
          </caption>
          <graphic xlink:href="medinform_v10i12e42379_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Data Sets and Annotation Rules</title>
        <p>The data set of this study was obtained from the Assistance Publique-Hôpitaux de Paris (AP-HP) data warehouse. Patients were informed that their EHR information could be reused after an anonymization process, and those who objected to the reuse of their data were excluded. All methods were carried out in accordance with relevant guidelines (reference methodology MR-004 of the Commission Nationale de l’Informatique et des Libertés [<xref ref-type="bibr" rid="ref19">19</xref>]).</p>
        <p>The data set contained all hospitalization reports, consultation reports, test results, prescriptions, etc of all patients older than 15 years with lupus, scleroderma, APS, and Takayasu arteritis who made at least one visit to AP-HP hospitals since 2017. The data set constitutes a set of 2 million pseudonymized clinical records. It was extracted using only the ICD-10 codes of the principal diagnosis for lupus (M320, M321, M328, M329, L930, L931, corresponding to 5176 patients), systemic sclerosis (M340, M341, M348, M349, corresponding to 2833 patients), APS (D686 corresponding to 1250 patients), and Takayasu arteritis (M314, corresponding to 287 patients).</p>
        <p>An internist physician annotated a training subset of 151 clinical notes (40 lupus, 35 APS, 37 systemic sclerosis, and 39 Takayasu) with symptoms or disorders by using specific attributes “negated,” “hypothetical,” and “belonging to family” when relevant. Guided by a clinical logic, we chose not only to annotate the negated terms as negation (eg, no fever, no diabetes) but also all the physiological descriptions (eg, peripheral pulse present, vesicular breath sounds present and symmetrical, regular heart sounds). All of these physiological findings were annotated as negative, because in clinical reasoning, we focus primarily on pathological signs. We adopted this approach also because the language models we use are able to capture both the syntactic and semantic levels of language. The medical subject heading (MeSH) category C [<xref ref-type="bibr" rid="ref20">20</xref>] head chapters (eg, cardiovascular, immune, digestive) were also annotated at the entity level. This annotated data set was used to train both the named-entity recognition step with the symptoms and disorders labels and the multilabel classification step with MeSH [<xref ref-type="bibr" rid="ref20">20</xref>] category C chapter head labels. Another test set of 256 hospitalization reports was annotated with one or more of the 6 phenotypes of interest, that is, osteoporosis, nephritis in SLE, ILD in systemic sclerosis, lung infection, obstetric APS, and Takayasu arteritis by another internist physician with no common patients between the training and testing data sets.</p>
        <p>The annotation rules were defined before starting. First, a phenotype was only positively annotated if it was explicitly written, and no interpretation was made of signs and test results to guess the phenotype. For example, for osteoporosis, neither bone mineral density nor the number of vertebral fractures was interpreted, and the only terms retained positively were osteoporosis and corticosteroid-induced osteoporosis. Detailed examples can be found in Figure S1 of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. We selected these phenotypes for their clinical significance both in the 4 pathologies of interest studied and globally in terms of osteoporosis and lung infection phenotypes. These phenotypes were selected as an example, but our algorithm can be generalized to handle very different phenotypes.</p>
      </sec>
      <sec>
        <title>Word Representations</title>
        <p>Two word representation models were used for this work. First, a French BERT model [<xref ref-type="bibr" rid="ref12">12</xref>], camemBERT, trained by Martin et al [<xref ref-type="bibr" rid="ref21">21</xref>] on a wide variety of French documents was used for the named-entity recognition and multilabel classification steps. Second, a FastText model developed by Bojanowski et al [<xref ref-type="bibr" rid="ref11">11</xref>] was used for the patient model to calculate the interpatient distance. Both methods convert words into vectors of real numbers (called embeddings). BERT produces embeddings that take into account the context (other words in the sentence), while FastText produces fixed embeddings (a word corresponds to a vector independently of the surrounding text). For our study, we had 2 million documents of all types (consultation records, hospitalization records, discharge summaries, etc), which correspond to a volume of 5 gigabytes of text. These data allowed us to train the FastText model from scratch. The camemBERT model was too large to train from scratch, but we fine-tuned it on our data, that is, we retrained its final layers. As a result, it was able to learn a context-appropriate vector representation (particularly effective for the feature extraction step 1); nevertheless, its initial vocabulary did not contain all the medical concepts, unlike the FastText model, which we used for the patient representation for the interpatient distance calculation.</p>
      </sec>
      <sec>
        <title>Named-entity Recognition</title>
        <p>This first step enables us to extract positive symptoms (pathologic signs) and disorders, filtering all terms corresponding to hypothetical, negated, and family-related elements. For instance, in <xref rid="figure1" ref-type="fig">Figure 1</xref> (index_patient), the extracted terms were “parietal focal status epilepticus,” “frontoparietal hematoma,” and “systemic lupus erythematosus,” whereas “angioedema” was not kept since it was only hypothetical. The algorithm used for this first step is based on an encoder (with BERT layers) and a bidirectional long short-term memory decoder. This neural named-entity recognition model, described in [<xref ref-type="bibr" rid="ref18">18</xref>], obtains an exact F-measurement of 0.931 on the English CoNLL data set [<xref ref-type="bibr" rid="ref22">22</xref>], using the BERT-large embeddings [<xref ref-type="bibr" rid="ref12">12</xref>], and 0.784 on GENIA [<xref ref-type="bibr" rid="ref23">23</xref>], using the BioBERT-large model [<xref ref-type="bibr" rid="ref24">24</xref>].</p>
      </sec>
      <sec>
        <title>Multilabel Classification</title>
        <p>To improve clinical interpretability and to analyze patients along several medical dimensions (ie, labels), we chose to perform a multilabel classification of all the terms. The corresponding classes are the MeSH-C head chapters, corresponding to 22 medical fields: infections, ophthalmologic, stomatology, cardiovascular, digestive, respiratory, nervous, etc. A BERT model for the sequence classification was used and trained on all annotated entities and all MeSH terms and their synonyms. Synonyms of MeSH terms were obtained by extracting all the French terms sharing the same concept unique identifier in the unified medical language system, defined by its authors as a “set of files and software that brings together many health and biomedical vocabularies to enable interoperability between computer systems” [<xref ref-type="bibr" rid="ref25">25</xref>]. This multilabel classifier has been described in our previous study and evaluated on an external challenge with an F1-score from 0.809 to 0.811 depending on the model used [<xref ref-type="bibr" rid="ref18">18</xref>]. For instance, for our index_patient in <xref rid="figure1" ref-type="fig">Figure 1</xref>, parietal focal status epilepticus is labelled as nervous, and systemic lupus erythematosus is labelled as immune and skin.</p>
      </sec>
      <sec>
        <title>Distance Computation</title>
        <p>We used FastText to obtain an embedding representation of each extracted term. With all the patients represented as a list of embeddings for each label, the distance between the patients can be computed based on one particular label of interest (cardiovascular, urogenital, etc), or several, or all. However, 2 patient records may contain different numbers of terms (ie, vectors) per label. For example, index_patient in <xref rid="figure1" ref-type="fig">Figure 1</xref> only presents 1 term on the cardiovascular label (lupus pericarditis), whereas patient_2 may present many cardiovascular terms such as coronary syndrome, hypertension, and stroke.</p>
        <p>Following Kusner et al’s [<xref ref-type="bibr" rid="ref26">26</xref>] idea, we decided to use the earth mover’s distance, a distance that minimizes the cost to be paid to transform one distribution into another. We compute this distance for each label. In our case, the distributions correspond to the set of terms per label, and each term corresponds to a point. The size of the point corresponds to the frequency of occurrence of the term, and the distance between the points corresponds to the cosine distance between the FastText embeddings of the terms. In our example, the immune label for index_patient is made of the terms SLE (1 occurrence), Raynaud (1 occurrence), Gougerot-Sjögren (1 occurrence), and lupus pericarditis (1 occurrence).</p>
        <p>Having a distance, we are now able to compare patients’ clinical notes on each label (provided that the patient’s record has at least one term present for this label) or globally. To compare 2 patients globally, we summed the earth mover’s distances of the 2 patients across each label and weighted them with the corresponding number of terms for each label. Equations (1) and (2) below specify the weighting term, where HR<sub>1</sub> and HR<sub>2</sub> denote 2 different hospitalization reports, and EMD() denotes the earth mover’s distance between the 2 notes for a specific label i.</p>
        <disp-formula>
        D(HR<sub>1</sub>, HR<sub>2</sub>) = (1/nlabels)*Σ<sub>i</sub> (λ<sub>i</sub> EMD(HR<sub>1</sub>(label<sub>i</sub>), HR<sub>2</sub>(label<sub>i</sub>))) <bold>(1)</bold>
        </disp-formula>
        <disp-formula>
        with λ<sub>i</sub> = (nHR<sub>1</sub>(label<sub>i</sub>) + nHR<sub>2</sub>(label<sub>i</sub>)) / (nHR<sub>1</sub> + nHR<sub>2</sub>) <bold>(2)</bold>
        </disp-formula>
        <p>where HR<sub>j</sub>(label<sub>i</sub>) is the list of terms from HR<sub>j</sub> involving label<sub>i</sub> and nHR is the number of terms in the term subset HR.</p>
      </sec>
      <sec>
        <title>Evaluation</title>
        <p>We evaluate our approach with the 6 use cases described earlier, each being associated with specific MeSH-C labels. For example, to obtain similar patients for the osteoporosis phenotype (labelled musculoskeletal and nutritional according to MeSH classification), we computed the earth mover’s distance of the hospitalization reports only on these 2 labels. Similarly, for ILD in systemic sclerosis, we focused on the respiratory and immune labels. For lung infection, we focused on the respiratory and infections labels, and so on. However, our algorithms can be applied to any new use case and to any set of MeSH-C labels.</p>
        <sec>
          <title>Clustering</title>
          <p>To visualize our results and to confirm the relevance of our approaches, we performed an unsupervised hierarchical clustering of all patients in the training data set on each label and globally, checking if patients with similar phenotypes belonged to the same clusters. We used agglomerative hierarchical clustering (each hospitalization report is initialized as a singleton cluster, and clusters are merged two-by-two) with Ward’s criterion, which minimizes the variance of the clusters. The same method was used for our 6 use case phenotypes. We used the SciPy library [<xref ref-type="bibr" rid="ref27">27</xref>].</p>
        </sec>
        <sec>
          <title>Selection of a Cohort of Similar Patients From an Index Patient</title>
          <p>We approach the problem of building a cohort of similar patients as an information retrieval problem, where the patient’s document (index patient) is a query. We then evaluate the ability of the system to return a ranked list of documents, with the most relevant/similar at the top of the list. <xref rid="figure2" ref-type="fig">Figure 2</xref> gives an overview of this selection on the example of a patient with the phenotype “Nephritis in SLE.” We evaluate the precision-at-k (percentage of correct phenotype prediction in the first k closest documents of distinct patients), the recall (percentage of all correct phenotypes that are selected in the first n closest patients, n being the number of patients in each phenotype), and the average precision. The average precision computes the average value of the precision for recall values over 0 to 1. It considers the order in which the patients are selected and corresponds to an estimate of the area under the precision-recall curve. For each phenotype, each patient from the test set is chosen in turn as an index patient, and the final results are an average over all patients. Confidence intervals were calculated using the normal distribution approximation.</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Example of document selection for the phenotype "Nephritis in systemic lupus erythematosus." First, from the clinical observation of the index patient, symptoms and diseases are extracted and classified according to medical subject heading-C chapter headings (step 1). Then, the distance is calculated on the UroGen and immune classes (specifically for this phenotype, step 2). Finally, the closest documents are those with the same written phenotype, corresponding to the patients in red in the figure, leading to a ranked list of the closest documents of distinct patients (step 3). SLE: Systemic lupus erythematosus; HR: Hospitalization report.</p>
            </caption>
            <graphic xlink:href="medinform_v10i12e42379_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Visualization</title>
          <p>A distance-based search result was also constructed to select the most similar patient to an index patient, with clickable labels where clinicians can choose any labels of interest they want to select (as in our phenotype examples). This search result returns the most similar patients on the selected labels in the descending order of similarity. A demonstration can be found at the following link [<xref ref-type="bibr" rid="ref28">28</xref>], with 4 use cases with word clouds of medical terms enabling the similarity decision. All our codes are available on GitHub [<xref ref-type="bibr" rid="ref29">29</xref>].</p>
        </sec>
      </sec>
      <sec>
        <title>Ethics Approval</title>
        <p>The results shown in this study are derived from the analysis of the AP-HP data warehouse. This study and its experimental protocol were approved by the AP-HP Scientific and Ethical Committee (IRB00011591 decision CSE 20-0093). All methods were carried out in accordance with relevant guidelines (reference methodology MR-004 of the Commission Nationale de l’Informatique et des Libertés [<xref ref-type="bibr" rid="ref19">19</xref>]). All medical records have been pseudonymized. Patients are informed by the AP-HP data warehouse that the data are pseudonymized and that they can object to their sharing. Their consent was therefore collected prior to our study.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Clustering</title>
        <p>The results of the unsupervised hierarchical clustering on our training data set of 151 EHRs are shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>, <xref rid="figure4" ref-type="fig">Figure 4</xref>, and <xref rid="figure5" ref-type="fig">Figure 5</xref>. Each cluster is enhanced with its corresponding word cloud (highlighting the frequencies of occurrence of terms within each cluster). Interestingly, on the immune label (<xref rid="figure3" ref-type="fig">Figure 3</xref>), we were able to properly separate patients with scleroderma (left, orange cluster) from patients with lupus or lupus with APS (green clusters). As mentioned earlier, 30% of APS is secondary to systemic lupus, and indeed, several patients with APS in our data set also had lupus. Similarly, on the digestive label (<xref rid="figure4" ref-type="fig">Figure 4</xref>), we were able to separate upper digestive manifestations (left cluster) from liver issues (right clusters). With regard to the global clustering (using equations 1 and 2 above), we obtained 4 different clusters, as shown in <xref rid="figure5" ref-type="fig">Figure 5</xref>. Scleroderma is clustered separately with forms of cutaneous lupus (right, purple cluster) from lupus with thromboembolic manifestations and APS (middle, red cluster) from Takayasu (second left, green cluster). Interestingly, scleroderma with pulmonary arterial hypertension (left, little orange cluster) is close to the Takayasu cluster with arterial complications. The test set included 100 patients with lupus, 87 with scleroderma, 51 with APS, and 18 with Takayasu arteritis. Only 4 Takayasu stroke were labelled and 7 obstetrical APS, which did not allow us to perform clustering or other performance computations. 
The clustering results for phenotypes osteoporosis and lung infection with ground truth labelled documents are shown as examples in <xref rid="figure6" ref-type="fig">Figure 6</xref> and <xref rid="figure7" ref-type="fig">Figure 7</xref>, respectively.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Unsupervised hierarchical clustering based on electronic health record earth mover's distance on the “immune” label. Word clouds of electronic health records words are plotted on each respective cluster. Interestingly, patients with systemic scleroderma all belong to the same cluster (orange). Only patients who were labelled “immune” are clustered; we thus represent 129 patients out of 151.</p>
          </caption>
          <graphic xlink:href="medinform_v10i12e42379_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Unsupervised hierarchical clustering based on earth mover's distance of electronic health records on the label “digestive.” The word cloud of the electronic health records is shown on each respective cluster. Interestingly, the left cluster reports upper digestive manifestations (oesophagitis, gastroesophageal reflux or RGO in French), and the rightmost cluster represents patients with liver diseases (brown cluster: cytolysis, hepatitis, hepatic), whereas the middle cluster represents patients with both conditions. Only patients who were labelled digestive are clustered; we thus represent 89 patients out of 151.</p>
          </caption>
          <graphic xlink:href="medinform_v10i12e42379_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Unsupervised ascending hierarchical clustering based on the overall earth mover's distance of the electronic health records from equations (1) and (2). Word clouds of term frequency in the electronic health records are plotted on each respective cluster.</p>
          </caption>
          <graphic xlink:href="medinform_v10i12e42379_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Unsupervised ascending hierarchical clustering based on earth mover's distance of electronic health records on the “osteomuscular” and “nutritional” labels (derived from the medical subject heading classification); only patients having the labels “osteomuscular” and “nutritional” are represented here (corresponding to 119 patients, not 256). All patients with osteoporosis were labelled “OSTEO” in the orange cluster. Other patients present in this cluster without explicitly written osteoporosis present “osteopenia” (all 4 first patients) or several vertebral fractures.</p>
          </caption>
          <graphic xlink:href="medinform_v10i12e42379_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure7" position="float">
          <label>Figure 7</label>
          <caption>
            <p>Unsupervised ascending hierarchical clustering based on earth mover's distance of electronic health records on the respiratory and infection axes (derived from the medical subject heading classification). All patients with lung infections were labelled “LUNG_INF” in the green cluster. Some outliers may be noticed; on the very left, the patient had purulent pleurisy, and one had pulmonary tuberculosis. The remaining patients on the left of the green cluster all had other linked manifestations such as bronchitis, parainfluenza infection, and bronchoalveolar lavage positive for <italic>Klebsiella pneumoniae</italic> and oropharyngeal flora.</p>
          </caption>
          <graphic xlink:href="medinform_v10i12e42379_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Selection of a Cohort of Similar Patients From an Index Patient</title>
        <p>The performance of cohort construction for the first 4 phenotypes is presented in <xref ref-type="table" rid="table1">Table 1</xref>. The last 2 phenotypes (P5-P6) could not be analyzed due to a limited number of phenotypes at the annotation stage (7 and 4, respectively).</p>
        <p>Overall, we obtained an average precision ranging from 0.58 to 0.88, precision@10 from 0.65 to 0.98, and recall from 0.53 to 0.83. However, the average precision was lower for P3 (ILD in systemic sclerosis) owing to the higher diversity of terms used to describe the lung condition, that is, fibrosis, ILD, scleroderma with pulmonary involvement, etc, and to the fact that the phenotype annotations were very specific. As an example, sclerodermatomyositis or mixed connective tissue disease with lung involvement, which are very close to this phenotype, were not annotated positively. An error analysis of the mentions encountered in close patients can be found in Table S1 of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. For the 4 phenotypes P1-P4, the precision-recall curves (means for all patients within each phenotype) were computed and are shown in Figure S1 of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, which is another way of showing the average precision performances. We showed very good results for the P1-P2 and P4 phenotypes and satisfactory results for the P3 phenotype since the patients had to present exactly the same disease.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Performance results for phenotype similarity (mean and 95% CI) for all patients of a phenotype. For each phenotype, each patient in the test set is chosen in turn as an index patient, and the final results are an average of all patients.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>P1, osteoporosis (n=23)</td>
                <td>P2, nephritis in systemic lupus erythematosus (n=48)</td>
                <td>P3, interstitial lung disease in systemic sclerosis (n=20)</td>
                <td>P4, lung infections (n=33)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Precision@3<sup>a</sup></td>
                <td>0.97 (0.91-1.0)</td>
                <td>0.99 (0.98-1.0)</td>
                <td>0.85 (0.75-0.95)</td>
                <td>0.92 (0.84-0.99)</td>
              </tr>
              <tr valign="top">
                <td>Precision@10</td>
                <td>0.95 (0.91-0.99)</td>
                <td>0.98 (0.97-0.99)</td>
                <td>0.65 (0.58-0.72)</td>
                <td>0.86 (0.81-0.92)</td>
              </tr>
              <tr valign="top">
                <td>Average precision</td>
                <td>0.88 (0.85-0.90)</td>
                <td>0.85 (0.83-0.87)</td>
                <td>0.58 (0.54-0.62)</td>
                <td>0.72 (0.69-0.75)</td>
              </tr>
              <tr valign="top">
                <td>Recall<sup>b</sup></td>
                <td>0.83 (0.81-0.84)</td>
                <td>0.79 (0.77-0.80)</td>
                <td>0.53 (0.50-0.55)</td>
                <td>0.66 (0.64-0.68)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>Precision@3 patients (precision@10) is presented, which represents the obtained precision calculated on the 3 (or 10) patients closest to the index patient (ie, with the minimum distance).</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>Recall is the recall calculated for all patients to be found with the same phenotype (ie, recall calculated on the 23 closest patients for osteoporosis, the 48 closest patients for nephritis in systemic lupus erythematosus, etc). Precision-recall curves for the 4 phenotypes are shown in Figure S1 of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Visualization</title>
        <p>As an illustration, <xref rid="figure8" ref-type="fig">Figures 8</xref> and <xref rid="figure9" ref-type="fig">9</xref> below show the search results described earlier for a patient with ILD in systemic sclerosis and nephritis in SLE, respectively. We see that for an index patient with ILD in systemic sclerosis (<xref rid="figure8" ref-type="fig">Figure 8</xref>), choosing the immune and respiratory labels led to the finding of 10 patients out of the 15 first, having the same condition. Interestingly, among these 15 samples, the 5 unlabeled patients had a disease very close to the expected one: “ILD evolving to fibrosis” and a “mixed connective tissue disease” for the first one (note_98, rank 4) and “sclerodermatomyositis” and “interstitial lung disease” for the second (note_182, rank 5). A more extensive analysis of the errors is presented in Table S1 of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. <xref rid="figure9" ref-type="fig">Figure 9</xref> shows the search results for an index patient with nephritis in SLE. All of the 20 closest patients on labels “immune” and “urogenital” showed nephritis in SLE.</p>
        <fig id="figure8" position="float">
          <label>Figure 8</label>
          <caption>
            <p>Search results of an index patient with interstitial lung disease; the darker the color is, the closer the patients are to that particular label. Here, the selected labels “immune” and “respiratory” in 8 of the 10 first patients are labelled with “PINS_Sclerodermie” (in French, ie, interstitial lung disease in systemic sclerosis).</p>
          </caption>
          <graphic xlink:href="medinform_v10i12e42379_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure9" position="float">
          <label>Figure 9</label>
          <caption>
            <p>Search results of a patient with nephritis in systemic lupus erythematosus. The darker the color is, the closer the patients are to that particular label. Here, the selected labels “immune” and “urogenital” in all the 20 first closest patients are labelled with the right phenotype nephro_lupus.</p>
          </caption>
          <graphic xlink:href="medinform_v10i12e42379_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Summary</title>
        <p>In this study, we developed a novel end-to-end algorithm from raw clinical notes to cohort similarity extraction. We have shown that we can cluster very specific phenotypes on an annotated data set and build similarity cohorts with good mean average precision results. These phenotypes and diseases were chosen as a proof of concept, with 2 general phenotypes such as osteoporosis and lung infection and 2 very specific phenotypes with nephritis in SLE and ILD in scleroderma. However, our algorithm can be applied to other phenotypes or diseases as well. Furthermore, our system can be applied to any other data warehouse and does not contain any handcrafted rules. An interactive demo is available online [<xref ref-type="bibr" rid="ref28">28</xref>], and all our codes are available on GitHub [<xref ref-type="bibr" rid="ref29">29</xref>].</p>
      </sec>
      <sec>
        <title>Advantages of Our Approach</title>
        <p>The main advantage of our approach is the proximity to clinical reasoning—the named-entity recognition step focusing on the distinction between physiological and pathological signs and the observations of the patients on the 22 main medical domains (cardiovascular, pulmonary, hemic, immune)—thereby allowing clinicians to choose on which aspect patients should be similar. This analysis provides interpretable results to clinicians as well as high modularity, which is essential in the field of therapeutic decision support. In clinical practice, this algorithm would enable the physician to automatically extract similar patients, evaluate their clinical evolution, and extrapolate them to the patient they want to treat. Our algorithm focuses on 1 patient’s hospitalization report rather than on the entire patient’s record (EHR), as we want to extract patients with similar conditions and similar acute complications at a time. This algorithm is also able to compare along very fine-grained characteristics. For example, 2 patients with osteoporosis complicated by a bone fracture will be closer than 2 patients with osteoporosis without a fracture. In addition, although our algorithm does not directly consider biological results in a quantitative manner, the clinician’s interpretation of these results in the text is systematically integrated and analyzed as a symptom, for example, anemia, hypoalbuminemia, and positive antibodies. Similarly, the pathological description of imaging reports, such as an alveolar condensation in radiology images or an abnormal left ventricular ejection fraction in echocardiograms will be taken into account in our algorithm. We show very good results in terms of precision and average precision for selecting similar patient cohorts. 
The robustness of the algorithm is demonstrated on the one hand by the evaluation of the precision@3, which is calculated here not for the construction of the cohort but rather to show that there is, as expected, a gradient of similarity from the closest to the most distant patients, and on the other hand, as shown in the error analysis, patients close to a given index patient had very similar disease, even if the exact phenotype was not encountered.</p>
      </sec>
      <sec>
        <title>Comparison With Previous Work</title>
        <p>Other studies have focused on patient similarity cohorts; for instance, in the French language, Garcelon et al [<xref ref-type="bibr" rid="ref30">30</xref>] used a patient representation and a similarity measure to try to find patients with rare diseases in the Dr Warehouse database [<xref ref-type="bibr" rid="ref31">31</xref>]. Although their objective is quite similar to ours, they used a different representation based on the term frequency–inverse document frequency weights of the extracted concept in each clinical note, and the concept extraction is based on handcrafted rules. They obtained a percentage of 71%-99% of indexed patients returning at least one similar true-positive patient within the first 30 similar patients, and the average number of patients with exactly the same disease among the 30 patients was 51%. In a second study based on the same term frequency–inverse document frequency similarity metric, they evaluated the association between clinical phenotypes and rare disease and measured the relevance of the first 50 similar patients by a domain expert a posteriori; they obtained average precision from 0.55 to 0.91 on 6 phenotypes with mean average precision of 0.79 [<xref ref-type="bibr" rid="ref32">32</xref>]. The main differences from our method are that we focus on clinical interpretability, and our metric computation is based on one of the most recent and performant language models [<xref ref-type="bibr" rid="ref12">12</xref>]. Moreover, in our case, the test set was annotated a priori. Jia et al [<xref ref-type="bibr" rid="ref33">33</xref>] also proposed an interesting algorithm for diagnostic prediction based on patient similarity, but unlike our method, their named-entity recognition step is based on a dictionary of symptoms, while disorders are extracted from ICD-10 coding. The similarity regarding symptoms is binary: 1 if the symptom is shared by both patients and 0 if otherwise. 
The similarity of diseases is based on their respective ICD-10 similarity (using the ICD-10 coding tree structure).</p>
        <p>Ng et al [<xref ref-type="bibr" rid="ref34">34</xref>] presented an insightful method based on a precision cohort (ie, patient-similarity cohorts) to help clinicians make treatment decisions for chronic diseases. They trained a global similarity model on a set of thousands of predefined variables (disease variables were constructed using their ICD-9 and ICD-10 codes, laboratory variables with their Logical Observation Identifiers Names and Codes, etc) that learns a disease-specific distance (for the 3 chronic diseases presented: hypertension, type 2 diabetes mellitus, and hyperlipidemia), with significant manual work to build the training data set. The authors did not compute direct measures of similarity cohorts but the direct impact of their method, with 75%, 74%, and 85% of decision points in hypertension, diabetes, and hyperlipidemia, respectively, and with at least one significantly better treatment. In contrast, our method focused on the performance of the similarity cohorts with metrics used in the information retrieval field, does not rely on manual variable definition, and does not learn disease-specific distance but a completely generic distance. One of the main advantages of our work is the original calculation of distance per class between patients; to the best of our knowledge, there is no similar work in the literature to compare our work to. However, we show that the named-entity recognition algorithm obtained state-of-the-art results, and the multilabel classification obtained the same performance as the best team of a French national challenge [<xref ref-type="bibr" rid="ref18">18</xref>].</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Our work has several limitations. First, it does not cover mental health diseases, which are a completely different branch of the MeSH classification. However, training the multilabel classifier with a new label for mental health diseases with MeSH terms and synonyms can be done fairly directly based on our framework. In addition, due to time constraints, the data used in this paper were labeled by only 1 internist, and the quality of the data labeling cannot be assessed. In addition, one could argue that we did not compare our clustering and cohort similarity extraction with an ICD-10 extraction. However, because we built our initial data set with ICD-10 codes for our 4 main pathologies, we had an initial bias that we could not overcome for fair comparison. In addition, nephritis in SLE, ILD in systemic sclerosis, and lung infections do not have direct ICD-10 codes used in clinical practice. For example, “glomerular disease with SLE” has the ICD-10 “M3214” but in the entire database of 39 different hospitals, no patient had this particular code. This is because the coding is primarily done to describe the severity of the patient being managed, and this last code, in particular, does not reflect the severity of the renal involvement (in our case, codes for nephritis usually used would be N03, N04, or N05 and M320, M321, M328, and M329 for SLE). Similarly, scleroderma with pulmonary involvement has an ICD-10 code M348 that also does not appear in our database.</p>
        <p>Assuming that an important clinical fact is repeated several times in a clinical report (eg, a patient hospitalized for acute coronary syndrome will have many cardiovascular terms linked to his/her cardiac condition), our distance computation from equations 1 and 2 depends on the number of terms in the document. Hence, 2 patients with the same major (repeated) problem would be relatively close. However, sometimes, repeated terms are not directly derived from a major clinical fact (for instance, medical history may be repeated several times without clinical relevance).</p>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>In this work, we have presented a novel end-to-end interpretable algorithm to automatically extract similar patients from an index patient based on clinical note analysis. Our algorithm shows good performance results for 4 specific phenotypes in the context of 4 systemic diseases. In this work, we focused only on pathological signs, but in clinical practice, one could also be interested in negative signs (for instance, the absence of Raynaud syndrome is very atypical in systemic sclerosis). This will be added in our future work, thereby adding a new physiological dimension to patients. In future work, the drug information will also be added for patient comparison, and similar to our presented approach, the clinician will then be able to focus only on treatments or on treatments and signs and symptoms. Finally, we will consider patients as a set of multiple longitudinal hospitalization reports (EHRs). An important perspective of this work is also to evaluate this tool in clinical practice.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Examples of terms extracted from a hospitalization report close to an index patient with interstitial lung disease.</p>
        <media xlink:href="medinform_v10i12e42379_app1.docx" xlink:title="DOCX File , 63 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AP-HP</term>
          <def>
            <p>Assistance Publique-Hôpitaux de Paris</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">APS</term>
          <def>
            <p>antiphospholipid syndrome</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">BERT</term>
          <def>
            <p>Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">ICD-9/ICD-10</term>
          <def>
            <p>International Classification of Diseases, Ninth/Tenth Revision</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">ILD</term>
          <def>
            <p>interstitial lung disease</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">MeSH</term>
          <def>
            <p>medical subject heading</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">SLE</term>
          <def>
            <p>systemic lupus erythematosus</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors would like to thank the Assistance Publique-Hôpitaux de Paris data warehouse, which provided the data and the computing power to carry out this study under good conditions. We would like to thank all the medical colleges, including the college of internal medicine, especially Prof Jean-Emmanuel Kahn, Dr Guillaume Bussone, Prof Sébastien Abad, Dr Virginie Zarrouk, Dr Noémie Chanson, Dr Antoine Dossier, Prof Luc Mouthon, and Dr Geoffrey Cheminet from the department of rheumatology. We would also like to thank Dr Augustin Latourte, Dr Florent Eymard, Prof Xavier Mariette, Dr Gaétane Nocturne, Prof Raphaele Serror, Prof Sébastien Ottaviani, Prof Francis Berenbaum, Prof Jérémie Sellam, Prof Yannick Allanore, Prof Jérôme Avouac, Prof Maxime Breban, Dr Félicie Costantino, and doctors from the dermatology, nephrology, pneumology, hepato-gastroenterology, hematology, endocrinology, gynecology, infectiology, cardiology, oncology, emergency, and intensive care units, who gave their agreements for the use of the clinical data.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data sets generated during this study (anonymized similarity measures between patients for the 4 use cases described in this paper) are available in the data repository at this link [<xref ref-type="bibr" rid="ref35">35</xref>]. The data sets analyzed in this study are not publicly available due to the confidentiality of data from patient records, even after deidentification. However, access to the Assistance Publique-Hôpitaux de Paris data warehouse’s raw data can be granted following the process described on its website [<xref ref-type="bibr" rid="ref36">36</xref>] by contacting the Scientific and Ethical Committee at secretariat.cse@aphp.fr. A prior validation of the access by the local institutional review board is required. In the case of researchers who are not from the Assistance Publique-Hôpitaux de Paris, the signature of a collaboration contract is mandatory.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>CG was involved in conceptualization, data curation, formal analysis, investigation, methodology, software validation, writing the original draft, reviewing, and editing. Arthur M was involved in data curation, methodology, annotation, and writing the original draft. Arsène M was involved in designing the methodology and writing the original draft. XT was involved in conceptualization, formal analysis, methodology design, writing the original draft, reviewing, and editing. FC was involved in conceptualization, methodology, project administration, supervision, writing the original draft, reviewing, and editing.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>GK</given-names>
            </name>
            <name name-style="western">
              <surname>Danciu</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Alamudun</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bitterman</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Tourassi</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Warner</surname>
              <given-names>JL</given-names>
            </name>
          </person-group>
          <article-title>Use of Natural Language Processing to Extract Clinical Cancer Phenotypes from Electronic Medical Records</article-title>
          <source>Cancer Res</source>
          <year>2019</year>
          <month>11</month>
          <day>01</day>
          <volume>79</volume>
          <issue>21</issue>
          <fpage>5463</fpage>
          <lpage>5470</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31395609"/>
          </comment>
          <pub-id pub-id-type="doi">10.1158/0008-5472.CAN-19-0579</pub-id>
          <pub-id pub-id-type="medline">31395609</pub-id>
          <pub-id pub-id-type="pii">0008-5472.CAN-19-0579</pub-id>
          <pub-id pub-id-type="pmcid">PMC7227798</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Galvin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Davidzon</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Scott</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>A Database-driven Decision Support System: Customized Mortality Prediction</article-title>
          <source>J Pers Med</source>
          <year>2012</year>
          <month>09</month>
          <day>27</day>
          <volume>2</volume>
          <issue>4</issue>
          <fpage>138</fpage>
          <lpage>48</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/23766893"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/jpm2040138</pub-id>
          <pub-id pub-id-type="medline">23766893</pub-id>
          <pub-id pub-id-type="pmcid">PMC3678286</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lieu</surname>
              <given-names>TA</given-names>
            </name>
            <name name-style="western">
              <surname>Herrinton</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Buzkov</surname>
              <given-names>DE</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lyons</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Neugebauer</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Needham</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Prausnitz</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Van Den Eeden</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>Baer</surname>
              <given-names>DM</given-names>
            </name>
          </person-group>
          <article-title>Developing a Prognostic Information System for Personalized Care in Real Time</article-title>
          <source>EGEMS (Wash DC)</source>
          <year>2019</year>
          <month>03</month>
          <day>25</day>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>2</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30937324"/>
          </comment>
          <pub-id pub-id-type="doi">10.5334/egems.266</pub-id>
          <pub-id pub-id-type="medline">30937324</pub-id>
          <pub-id pub-id-type="pmcid">PMC6437692</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Frankovich</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Longhurst</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Sutherland</surname>
              <given-names>SM</given-names>
            </name>
          </person-group>
          <article-title>Evidence-based medicine in the EMR era</article-title>
          <source>N Engl J Med</source>
          <year>2011</year>
          <month>11</month>
          <day>10</day>
          <volume>365</volume>
          <issue>19</issue>
          <fpage>1758</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1056/NEJMp1108726</pub-id>
          <pub-id pub-id-type="medline">22047518</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Callahan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Polony</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Posada</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Banda</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Gombar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>NH</given-names>
            </name>
          </person-group>
          <article-title>ACE: the Advanced Cohort Engine for searching longitudinal patient records</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2021</year>
          <month>07</month>
          <day>14</day>
          <volume>28</volume>
          <issue>7</issue>
          <fpage>1468</fpage>
          <lpage>1479</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/33712854"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocab027</pub-id>
          <pub-id pub-id-type="medline">33712854</pub-id>
          <pub-id pub-id-type="pii">6169466</pub-id>
          <pub-id pub-id-type="pmcid">PMC8279796</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Névéol</surname>
              <given-names>Aurélie</given-names>
            </name>
            <name name-style="western">
              <surname>Dalianis</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Velupillai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Zweigenbaum</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Clinical Natural Language Processing in languages other than English: opportunities and challenges</article-title>
          <source>J Biomed Semantics</source>
          <year>2018</year>
          <month>03</month>
          <day>30</day>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>12</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://jbiomedsem.biomedcentral.com/articles/10.1186/s13326-018-0179-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13326-018-0179-8</pub-id>
          <pub-id pub-id-type="medline">29602312</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13326-018-0179-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC5877394</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Farzandipour</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sheikhtaheri</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sadoughi</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Effective factors on accuracy of principal diagnosis coding based on International Classification of Diseases, the 10th revision (ICD-10)</article-title>
          <source>International Journal of Information Management</source>
          <year>2010</year>
          <month>02</month>
          <volume>30</volume>
          <issue>1</issue>
          <fpage>78</fpage>
          <lpage>84</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1016/j.ijinfomgt.2009.07.002"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.ijinfomgt.2009.07.002</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Benkhaial</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kaltschmidt</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Weisshaar</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Diepgen</surname>
              <given-names>TL</given-names>
            </name>
            <name name-style="western">
              <surname>Haefeli</surname>
              <given-names>WE</given-names>
            </name>
          </person-group>
          <article-title>Prescribing errors in patients with documented drug allergies: comparison of ICD-10 coding and written patient notes</article-title>
          <source>Pharm World Sci</source>
          <year>2009</year>
          <month>08</month>
          <volume>31</volume>
          <issue>4</issue>
          <fpage>464</fpage>
          <lpage>472</lpage>
          <pub-id pub-id-type="doi">10.1007/s11096-009-9300-5</pub-id>
          <pub-id pub-id-type="medline">19412703</pub-id>
          <pub-id pub-id-type="pii">10.1007/s11096-009-9300-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Efficient estimation of word representations in vector space</article-title>
          <source>ArXiv</source>
          <year>2013</year>
          <fpage>1</fpage>
          <lpage>12</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.researchgate.net/publication/234131319_Efficient_Estimation_of_Word_Representations_in_Vector_Space"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1301.3781</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pennington</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>CD</given-names>
            </name>
          </person-group>
          <article-title>GloVe: Global vectors for word representation</article-title>
          <year>2014</year>
          <conf-name>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name>
          <conf-date>October</conf-date>
          <conf-loc>Doha, Qatar</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.aclweb.org/anthology/D14-1162"/>
          </comment>
          <pub-id pub-id-type="doi">10.3115/v1/d14-1162</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bojanowski</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Grave</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Joulin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Enriching Word Vectors with Subword Information</article-title>
          <source>TACL</source>
          <year>2017</year>
          <month>12</month>
          <volume>5</volume>
          <fpage>135</fpage>
          <lpage>146</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.48550/arXiv.1607.04606"/>
          </comment>
          <pub-id pub-id-type="doi">10.1162/tacl_a_00051</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
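            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>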
          </person-group>
          <article-title>BERT: Pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>ACL Anthology</source>
          <access-date>2018-10-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/N19-1423.pdf">https://aclanthology.org/N19-1423.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>De Freitas</surname>
              <given-names>JK</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>KW</given-names>
            </name>
            <name name-style="western">
              <surname>Golden</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Nadkarni</surname>
              <given-names>GN</given-names>
            </name>
            <name name-style="western">
              <surname>Dudley</surname>
              <given-names>JT</given-names>
            </name>
            <name name-style="western">
              <surname>Bottinger</surname>
              <given-names>EP</given-names>
            </name>
            <name name-style="western">
              <surname>Glicksberg</surname>
              <given-names>BS</given-names>
            </name>
            <name name-style="western">
              <surname>Miotto</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Phe2vec: Automated disease phenotyping based on unsupervised embeddings from electronic health records</article-title>
          <source>Patterns (N Y)</source>
          <year>2021</year>
          <month>09</month>
          <day>10</day>
          <volume>2</volume>
          <issue>9</issue>
          <fpage>100337</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2666-3899(21)00185-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.patter.2021.100337</pub-id>
          <pub-id pub-id-type="medline">34553174</pub-id>
          <pub-id pub-id-type="pii">S2666-3899(21)00185-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC8441576</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jonquet</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>NH</given-names>
            </name>
            <name name-style="western">
              <surname>Musen</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>The open biomedical annotator</article-title>
          <source>Summit Transl Bioinform</source>
          <year>2009</year>
          <month>03</month>
          <day>01</day>
          <volume>2009</volume>
          <fpage>56</fpage>
          <lpage>60</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/21347171"/>
          </comment>
          <pub-id pub-id-type="medline">21347171</pub-id>
          <pub-id pub-id-type="pmcid">PMC3041576</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ferté</surname>
              <given-names>Thomas</given-names>
            </name>
            <name name-style="western">
              <surname>Cossin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schaeverbeke</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Barnetche</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Jouhet</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Hejblum</surname>
              <given-names>BP</given-names>
            </name>
          </person-group>
          <article-title>Automatic phenotyping of electronical health record: PheVis algorithm</article-title>
          <source>J Biomed Inform</source>
          <year>2021</year>
          <month>05</month>
          <volume>117</volume>
          <fpage>103746</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(21)00075-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2021.103746</pub-id>
          <pub-id pub-id-type="medline">33746080</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(21)00075-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="web">
          <source>FAI2R</source>
          <access-date>2022-11-25</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.fai2r.org/">https://www.fai2r.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="web">
          <source>Takayasu Arteritis</source>
          <access-date>2022-11-25</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.has-sante.fr/upload/docs/application/pdf/2020-01/pnds_takayasu_fair_-_favamulti.pdf">https://www.has-sante.fr/upload/docs/application/pdf/2020-01/pnds_takayasu_fair_-_favamulti.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gérardin</surname>
              <given-names>Christel</given-names>
            </name>
            <name name-style="western">
              <surname>Wajsbürt</surname>
              <given-names>Perceval</given-names>
            </name>
            <name name-style="western">
              <surname>Vaillant</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bellamine</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Carrat</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Tannier</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Multilabel classification of medical concepts for patient clinical profile identification</article-title>
          <source>Artif Intell Med</source>
          <year>2022</year>
          <month>06</month>
          <volume>128</volume>
          <fpage>102311</fpage>
          <pub-id pub-id-type="doi">10.1016/j.artmed.2022.102311</pub-id>
          <pub-id pub-id-type="medline">35534148</pub-id>
          <pub-id pub-id-type="pii">S0933-3657(22)00076-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <source>CNIL</source>
          <access-date>2018-05-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cnil.fr/en/home">https://www.cnil.fr/en/home</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
          <article-title>MeSH</article-title>
          <source>National Center for Biotechnology Information</source>
          <access-date>2017-02-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ncbi.nlm.nih.gov/mesh/">https://www.ncbi.nlm.nih.gov/mesh/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Martin</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Muller</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Suárez</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Dupont</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Romary</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>de La Clergerie</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>CamemBERT: a tasty French language model</article-title>
          <source>ACL Anthology</source>
          <access-date>2020-07-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2020.acl-main.645.pdf">https://aclanthology.org/2020.acl-main.645.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tjong Kim Sang</surname>
              <given-names>EF</given-names>
            </name>
            <name name-style="western">
              <surname>De Meulder</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Introduction to the CoNLL-2003 shared task: language-independent named entity recognition</article-title>
          <source>ACL Anthology</source>
          <year>2003</year>
          <access-date>2003-06-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/W03-0419.pdf">https://aclanthology.org/W03-0419.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ohta</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Tateisi</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tsujii</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>GENIA corpus--semantically annotated corpus for bio-textmining</article-title>
          <source>Bioinformatics</source>
          <year>2003</year>
          <volume>19 Suppl 1</volume>
          <fpage>i180</fpage>
          <lpage>2</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btg1023</pub-id>
          <pub-id pub-id-type="medline">12855455</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>So</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title>
          <source>Bioinformatics</source>
          <year>2020</year>
          <month>02</month>
          <day>15</day>
          <volume>36</volume>
          <issue>4</issue>
          <fpage>1234</fpage>
          <lpage>1240</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31501885"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id>
          <pub-id pub-id-type="medline">31501885</pub-id>
          <pub-id pub-id-type="pii">5566506</pub-id>
          <pub-id pub-id-type="pmcid">PMC7703786</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="web">
          <article-title>Unified medical language system</article-title>
          <source>National Library of Medicine</source>
          <access-date>2022-11-25</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.nlm.nih.gov/research/umls/index.html">https://www.nlm.nih.gov/research/umls/index.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kusner</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Y</given-names>
            </name>
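            <name name-style="western">
              <surname>Kolkin</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Weinberger</surname>
              <given-names>K</given-names>
            </name>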
          </person-group>
          <article-title>From word embeddings to document distances</article-title>
          <year>2015</year>
          <conf-name>ICML'15: Proceedings of the 32nd International Conference on Machine Learning</conf-name>
          <conf-date>July 6-11</conf-date>
          <conf-loc>Lille, France</conf-loc>
          <fpage>957</fpage>
          <lpage>966</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.5555/3045118.3045221"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Virtanen</surname>
              <given-names>Pauli</given-names>
            </name>
            <name name-style="western">
              <surname>Gommers</surname>
              <given-names>Ralf</given-names>
            </name>
            <name name-style="western">
              <surname>Oliphant</surname>
              <given-names>Travis E</given-names>
            </name>
            <name name-style="western">
              <surname>Haberland</surname>
              <given-names>Matt</given-names>
            </name>
            <name name-style="western">
              <surname>Reddy</surname>
              <given-names>Tyler</given-names>
            </name>
            <name name-style="western">
              <surname>Cournapeau</surname>
              <given-names>David</given-names>
            </name>
            <name name-style="western">
              <surname>Burovski</surname>
              <given-names>Evgeni</given-names>
            </name>
            <name name-style="western">
              <surname>Peterson</surname>
              <given-names>Pearu</given-names>
            </name>
            <name name-style="western">
              <surname>Weckesser</surname>
              <given-names>Warren</given-names>
            </name>
            <name name-style="western">
              <surname>Bright</surname>
              <given-names>Jonathan</given-names>
            </name>
            <name name-style="western">
              <surname>van der Walt</surname>
              <given-names>Stéfan J</given-names>
            </name>
            <name name-style="western">
              <surname>Brett</surname>
              <given-names>Matthew</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>Joshua</given-names>
            </name>
            <name name-style="western">
              <surname>Millman</surname>
              <given-names>K Jarrod</given-names>
            </name>
            <name name-style="western">
              <surname>Mayorov</surname>
              <given-names>Nikolay</given-names>
            </name>
            <name name-style="western">
              <surname>Nelson</surname>
              <given-names>Andrew R J</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>Eric</given-names>
            </name>
            <name name-style="western">
              <surname>Kern</surname>
              <given-names>Robert</given-names>
            </name>
            <name name-style="western">
              <surname>Larson</surname>
              <given-names>Eric</given-names>
            </name>
            <name name-style="western">
              <surname>Carey</surname>
              <given-names>C J</given-names>
            </name>
            <name name-style="western">
              <surname>Polat</surname>
              <given-names>İlhan</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>Yu</given-names>
            </name>
            <name name-style="western">
              <surname>Moore</surname>
              <given-names>Eric W</given-names>
            </name>
            <name name-style="western">
              <surname>VanderPlas</surname>
              <given-names>Jake</given-names>
            </name>
            <name name-style="western">
              <surname>Laxalde</surname>
              <given-names>Denis</given-names>
            </name>
            <name name-style="western">
              <surname>Perktold</surname>
              <given-names>Josef</given-names>
            </name>
            <name name-style="western">
              <surname>Cimrman</surname>
              <given-names>Robert</given-names>
            </name>
            <name name-style="western">
              <surname>Henriksen</surname>
              <given-names>Ian</given-names>
            </name>
            <name name-style="western">
              <surname>Quintero</surname>
              <given-names>E A</given-names>
            </name>
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>Charles R</given-names>
            </name>
            <name name-style="western">
              <surname>Archibald</surname>
              <given-names>Anne M</given-names>
            </name>
            <name name-style="western">
              <surname>Ribeiro</surname>
              <given-names>Antônio H</given-names>
            </name>
            <name name-style="western">
              <surname>Pedregosa</surname>
              <given-names>Fabian</given-names>
            </name>
            <name name-style="western">
              <surname>van Mulbregt</surname>
              <given-names>Paul</given-names>
            </name>
            <collab>SciPy 1.0 Contributors</collab>
          </person-group>
          <article-title>Author Correction: SciPy 1.0: fundamental algorithms for scientific computing in Python</article-title>
          <source>Nat Methods</source>
          <year>2020</year>
          <month>03</month>
          <volume>17</volume>
          <issue>3</issue>
          <fpage>352</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/32094914"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41592-020-0772-5</pub-id>
          <pub-id pub-id-type="medline">32094914</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41592-020-0772-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC7056641</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="web">
          <article-title>Patient similarity demo</article-title>
          <source>Xavier Tannier</source>
          <year>2022</year>
          <access-date>2022-05-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://xavier.tannier.free.fr/misc/patient_similarity/demo.html">http://xavier.tannier.free.fr/misc/patient_similarity/demo.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gérardin</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Cohort similarity</article-title>
          <source>GitHub</source>
          <year>2022</year>
          <access-date>2022-05-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/ChristelDG/cohort-similarity">https://github.com/ChristelDG/cohort-similarity</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Garcelon</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Neuraz</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Benoit</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Salomon</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kracker</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Suarez</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Bahi-Buisson</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hadj-Rabia</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Fischer</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Munnich</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Burgun</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Finding patients using similarity measures in a rare diseases-oriented clinical data warehouse: Dr. Warehouse and the needle in the needle stack</article-title>
          <source>J Biomed Inform</source>
          <year>2017</year>
          <month>09</month>
          <volume>73</volume>
          <fpage>51</fpage>
          <lpage>61</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(17)30176-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2017.07.016</pub-id>
          <pub-id pub-id-type="medline">28754522</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(17)30176-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Garcelon</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Neuraz</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Salomon</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Faour</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Benoit</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Delapalme</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Munnich</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Burgun</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rance</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>A clinician friendly data warehouse oriented toward narrative reports: Dr. Warehouse</article-title>
          <source>J Biomed Inform</source>
          <year>2018</year>
          <month>04</month>
          <volume>80</volume>
          <fpage>52</fpage>
          <lpage>63</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(18)30038-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2018.02.019</pub-id>
          <pub-id pub-id-type="medline">29501921</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(18)30038-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Garcelon</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Neuraz</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Salomon</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bahi-Buisson</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Amiel</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Picard</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Mahlaoui</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Benoit</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Burgun</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rance</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Next generation phenotyping using narrative reports in a rare disease clinical data warehouse</article-title>
          <source>Orphanet J Rare Dis</source>
          <year>2018</year>
          <month>05</month>
          <day>31</day>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>85</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ojrd.biomedcentral.com/articles/10.1186/s13023-018-0830-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13023-018-0830-6</pub-id>
          <pub-id pub-id-type="medline">29855327</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13023-018-0830-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC5984368</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Duan</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>A patient-similarity-based model for diagnostic prediction</article-title>
          <source>Int J Med Inform</source>
          <year>2020</year>
          <month>03</month>
          <volume>135</volume>
          <fpage>104073</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1386-5056(19)31092-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2019.104073</pub-id>
          <pub-id pub-id-type="medline">31923816</pub-id>
          <pub-id pub-id-type="pii">S1386-5056(19)31092-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kartoun</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Stavropoulos</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zambrano</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>PC</given-names>
            </name>
          </person-group>
          <article-title>Personalized treatment options for chronic diseases using precision cohort analytics</article-title>
          <source>Sci Rep</source>
          <year>2021</year>
          <month>01</month>
          <day>13</day>
          <volume>11</volume>
          <issue>1</issue>
          <fpage>1139</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-021-80967-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-021-80967-5</pub-id>
          <pub-id pub-id-type="medline">33441956</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-021-80967-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC7806725</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gérardin</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Cohort similarity main data</article-title>
          <source>GitHub</source>
          <year>2022</year>
          <access-date>2022-05-23</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/ChristelDG/cohort-similarity/tree/main/data">https://github.com/ChristelDG/cohort-similarity/tree/main/data</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="web">
          <article-title>Entrepôt de données de Santé de l'AP-HP</article-title>
          <source>Citrix Gateway</source>
          <access-date>2022-05-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.eds.aphp.fr">https://www.eds.aphp.fr</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
