<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i11e23930</article-id>
      <article-id pub-id-type="pmid">33252349</article-id>
      <article-id pub-id-type="doi">10.2196/23930</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Machine Learning Electronic Health Record Identification of Patients with Rheumatoid Arthritis: Algorithm Pipeline Development and Validation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Rodriguez Rodriguez</surname>
            <given-names>Luis</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Popa</surname>
            <given-names>Calin</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Maarseveen</surname>
            <given-names>Tjardo D</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1701-1040</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Meinderink</surname>
            <given-names>Timo</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6720-1446</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Reinders</surname>
            <given-names>Marcel J T</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1148-1562</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Knitza</surname>
            <given-names>Johannes</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9695-0657</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Huizinga</surname>
            <given-names>Tom W J</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7033-7520</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Kleyer</surname>
            <given-names>Arnd</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2026-7728</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Simon</surname>
            <given-names>David</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8310-7820</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>van den Akker</surname>
            <given-names>Erik B</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7693-0728</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Knevel</surname>
            <given-names>Rachel</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Rheumatology</institution>
            <institution>Leiden University Medical Center</institution>
            <addr-line>C1-R k. 41</addr-line>
            <addr-line>Albinusdreef 2</addr-line>
            <addr-line>Leiden, 2333 ZA</addr-line>
            <country>Netherlands</country>
            <phone>31 611307780</phone>
            <email>R.Knevel@lumc.nl</email>
          </address>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7494-3023</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Rheumatology</institution>
        <institution>Leiden University Medical Center</institution>
        <addr-line>Leiden</addr-line>
        <country>Netherlands</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Internal Medicine 3</institution>
        <institution>Friedrich-Alexander University Erlangen‐Nuremberg</institution>
        <addr-line>Erlangen</addr-line>
        <country>Germany</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Deutsches Zentrum für Immuntherapie</institution>
        <institution>Erlangen-Nuremberg and Universitätsklinikum</institution>
        <addr-line>Erlangen</addr-line>
        <country>Germany</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Leiden Computational Biology Centre</institution>
        <institution>Leiden University Medical Center</institution>
        <addr-line>Leiden</addr-line>
        <country>Netherlands</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Molecular Epidemiology</institution>
        <institution>Leiden University Medical Center</institution>
        <addr-line>Leiden</addr-line>
        <country>Netherlands</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>Division of Rheumatology, Inflammation and Immunity</institution>
        <institution>Brigham and Women's Hospital</institution>
        <institution>Harvard Medical School</institution>
        <addr-line>Boston, MA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Rachel Knevel <email>R.Knevel@lumc.nl</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>11</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>30</day>
        <month>11</month>
        <year>2020</year>
      </pub-date>
      <volume>8</volume>
      <issue>11</issue>
      <elocation-id>e23930</elocation-id>
      <history>
        <date date-type="received">
          <day>28</day>
          <month>8</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>20</day>
          <month>9</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>18</day>
          <month>10</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>24</day>
          <month>10</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Tjardo D Maarseveen, Timo Meinderink, Marcel J T Reinders, Johannes Knitza, Tom W J Huizinga, Arnd Kleyer, David Simon, Erik B van den Akker, Rachel Knevel. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 30.11.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://medinform.jmir.org/2020/11/e23930/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Financial codes are often used to extract diagnoses from electronic health records. This approach is prone to false positives. Alternatively, queries are constructed, but these are highly center and language specific. A tantalizing alternative is the automatic identification of patients by employing machine learning on format-free text entries.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>The aim of this study was to develop an easily implementable workflow that builds a machine learning algorithm capable of accurately identifying patients with rheumatoid arthritis from format-free text fields in electronic health records.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Two electronic health record data sets were employed: Leiden (n=3000) and Erlangen (n=4771). Using a portion of the Leiden data (n=2000), we compared 6 different machine learning methods and a naïve word-matching algorithm using 10-fold cross-validation. Performances were compared using the area under the receiver operating characteristic curve (AUROC) and the area under the precision recall curve (AUPRC), and F1 score was used as the primary criterion for selecting the best method to build a classifying algorithm. We selected the optimal threshold of positive predictive value for case identification based on the output of the best method in the training data. This validation workflow was subsequently applied to a portion of the Erlangen data (n=4293). For testing, the best performing methods were applied to remaining data (Leiden n=1000; Erlangen n=478) for an unbiased evaluation.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>For the Leiden data set, the word-matching algorithm demonstrated mixed performance (AUROC 0.90; AUPRC 0.33; F1 score 0.55), and 4 methods significantly outperformed word-matching, with support vector machines performing best (AUROC 0.98; AUPRC 0.88; F1 score 0.83). Applying this support vector machine classifier to the test data resulted in a similarly high performance (F1 score 0.81; positive predictive value [PPV] 0.94), and with this method, we could identify 2873 patients with rheumatoid arthritis in less than 7 seconds out of the complete collection of 23,300 patients in the Leiden electronic health record system. For the Erlangen data set, gradient boosting performed best (AUROC 0.94; AUPRC 0.85; F1 score 0.82) in the training set, and applied to the test data, resulted once again in good results (F1 score 0.67; PPV 0.97).</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>We demonstrate that machine learning methods can extract the records of patients with rheumatoid arthritis from electronic health record data with high precision, allowing research on very large populations for limited costs. Our approach is language and center independent and could be applied to any type of diagnosis. We have developed our pipeline into a universally applicable and easy-to-implement workflow to equip centers with their own high-performing algorithm. This allows the creation of observational studies of unprecedented size covering different countries for low cost from already available data in electronic health record systems.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>Supervised machine learning</kwd>
        <kwd>Electronic Health Records</kwd>
        <kwd>Natural Language Processing</kwd>
        <kwd>Support Vector Machine</kwd>
        <kwd>Gradient Boosting</kwd>
        <kwd>Rheumatoid Arthritis</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Electronic health records (EHR) offer an interesting collection of clinical information for observational research, yet a crucial step is an accurate identification of disease cases. This is commonly done by manual chart review or by using standardized billing codes. However, these methods are either labor-intensive or prone to including false positives. Previous studies [<xref ref-type="bibr" rid="ref1">1</xref>] found that using only standardized billing codes, for example, ≥3 International Classification of Diseases, Ninth Revision (ICD-9) rheumatoid arthritis codes, results in a positive predictive value (PPV) of 56% (95% CI 47%-64%). Using a combination of billing code with a disease-modifying antirheumatic drug code (≥1 ICD-9 rheumatoid arthritis code plus ≥1 disease-modifying antirheumatic drug) results in a PPV of 45% (95% CI 37%-53%). Clinical diagnoses can also be inferred by performing naïve word-matching on format-free text fields. This approach does not take into account the provided context and is thus prone to false positives as well.</p>
      <p>Alternatively, query-like algorithms can be used. However, these algorithms require knowledge on the diagnosis of interest, biasing the inclusion of potential study cases. For example, when we want to identify patients with rheumatoid arthritis, we can select people with cyclic citrullinated peptide antibodies that were treated with methotrexate. Those identified likely concern true cases of rheumatoid arthritis but are biased as patients with rheumatoid arthritis do not always receive methotrexate and do not all have cyclic citrullinated peptide–positive tests. On the other hand, selecting only methotrexate would create many false positives as methotrexate is prescribed for many other rheumatic diseases. An additional disadvantage is that rule-based algorithms tend to be center-specific and perform less well in other clinics [<xref ref-type="bibr" rid="ref2">2</xref>].</p>
      <p>Advancements in natural language processing and machine learning have created great potential for processing format-free text data such as those in EHRs [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. A major advantage of machine learning is that it can learn extraction patterns from a set of training examples, relieving the need for extensive domain knowledge. We set out to explore the utility of machine learning methods to identify patients with rheumatoid arthritis from format-free text fields in EHRs. As machine learning methods learn from presented training examples, they can suffer from intercenter variability due to different notation characteristics in EHRs [<xref ref-type="bibr" rid="ref2">2</xref>].</p>
      <p>Therefore, the aim of this study was to develop a broadly applicable workflow that employs machine learning methods to identify patients with rheumatoid arthritis from format-free text fields of EHRs. Additionally, the workflow should be easy to implement and require only the annotation of a subset of the total data set.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Patients’ Data Collection</title>
        <sec>
          <title>Overview</title>
          <p>For this study, we employed 2 data sets: Leiden (the Netherlands) and Erlangen (Germany). See <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> (Table S1) for a convenient overview of the study outline for both centers.</p>
        </sec>
        <sec>
          <title>Leiden Data Set</title>
          <p>We retrieved EHR data from patients (n=23,300) who visited the rheumatology outpatient clinic of the Leiden University Medical Centre since 2011 (<xref rid="figure1" ref-type="fig">Figure 1</xref>). We used the <italic>Conclusion</italic> section of the patient records, which consisted of format-free text fields describing the symptoms and (differential) diagnoses of the patient. From these dossiers, 11,786 patients had a first visit after the initiation of the digital system in 2011 [<xref ref-type="bibr" rid="ref4">4</xref>]. We randomly selected 3000 patients from these newly referred patients and extracted all of their entries for up to 1 year of follow-up. A clinician manually reviewed all entries and annotated the final diagnosis based on all entries. The data were divided into 2 independent sets with a 66/33 split: Leiden-A (n=2000) for model selection, training, and validation and Leiden-B (n=1000) for independent testing. The study was approved by the local ethics board.</p>
          <fig id="figure1" position="float">
            <label>Figure 1</label>
            <caption>
              <p>Study outline of the Leiden cohort. EHR: electronic health record; NLP: natural language processing.</p>
            </caption>
            <graphic xlink:href="medinform_v8i11e23930_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Erlangen Data Set</title>
          <p>After model selection, training, and validation analyses were performed on the Leiden data, we evaluated the universal applicability of our pipeline by applying it to the EHR data from a second center. We retrieved admission notes from the EHR database of University Hospital Erlangen (Department of Internal Medicine 3 Rheumatology and Immunology, Universitätsklinikum). The <italic>course &#38; assessment</italic> component was used because it featured the patient status descriptions. These data consisted of 4771 patients in total featuring all their entries up to 1 year of follow-up. A health care professional manually reviewed all entries and annotated the final diagnosis based on all entries. The Erlangen data set was divided into 2 independent sets with a 90/10 split: Erlangen-A (n=4293) for model selection, training, and validation and Erlangen-B (n=478) for testing. The study was approved by the local ethics board.</p>
        </sec>
      </sec>
      <sec>
        <title>Training, Model Selection, and Validation (Leiden-A and Erlangen-A)</title>
        <sec>
          <title>Preprocessing Format-Free Text</title>
          <p>We employed spell check and several natural language processing techniques to preprocess the extracted text with scikit-learn tools provided by Pedregosa et al [<xref ref-type="bibr" rid="ref5">5</xref>]. The pipeline can be divided into 5 steps: word segmentation, lowercase conversion, stop word removal, word normalization, and vectorization. First, we segmented the text into words, splitting by spaces and special characters. Next, we converted the text to lowercase and removed the irrelevant but highly prevalent stop words. Morphological variation was further reduced by applying lemmatization to normalize words to their base form. The tools provide lemmatization tools for many languages; we used the Dutch and German language tools. Segmented words were then aggregated by grouping neighboring words into sets of 3 (ie, <italic>n</italic>-grams such as <italic>patient</italic>, <italic>verdenking artritis</italic>). Finally, a <italic>term frequency by inverse document frequency</italic> transformation, which builds a clinical vocabulary and weighs words according to their occurrence, was applied to vectorize the text data.</p>
        </sec>
        <sec>
          <title>Training and Machine Learning Model Selection</title>
          <p>We tested the following machine learning methods: naïve Bayes [<xref ref-type="bibr" rid="ref6">6</xref>], neural networks [<xref ref-type="bibr" rid="ref7">7</xref>], random forest [<xref ref-type="bibr" rid="ref8">8</xref>], support vector machine [<xref ref-type="bibr" rid="ref9">9</xref>], gradient boosting [<xref ref-type="bibr" rid="ref10">10</xref>], decision tree [<xref ref-type="bibr" rid="ref8">8</xref>], and a random classifier, which assigns class labels at random with frequencies equal to those observed in the training set (parameters are shown in Table S2, <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Default scikit-learn implementations were used to create the machine learning models [<xref ref-type="bibr" rid="ref11">11</xref>].</p>
          <p>Furthermore, we employed a naïve word-matching algorithm that assigns rheumatoid arthritis status to a sample when the term rheumatoid arthritis (in German or Dutch) or its abbreviation appears in the chart. Each classifier gives a score between 0 and 1 that we interpreted as a probability for each sample to be a case.</p>
          <p>We randomly split the Leiden-A and Erlangen-A data sets into training and validation sets using a 10-fold cross-validation procedure for model selection [<xref ref-type="bibr" rid="ref12">12</xref>]. In short, for each sample set, different models were trained and evaluated in equally sized training and validation sets. Classification performances in the validation sets were then averaged over the samples to give robust estimates of the ability of each individually evaluated method to annotate unseen EHR records with a rheumatoid arthritis status.</p>
        </sec>
        <sec>
          <title>Performance Validation</title>
          <p>As each classifier generates a probability score for rheumatoid arthritis, the performance of a classifier can be tested by applying different cut-offs for case identification. With these probabilities, we first generated receiver operating characteristic curves, plotting the true positive rate against the false positive rate for all probability scores. Second, we created precision-recall curves, plotting the precision (PPV) against the recall (sensitivity or true positive rate) for all score thresholds. Classification performance was then measured using the area under the receiver operating characteristic curve (AUROC) and the area under the precision-recall curve (AUPRC) [<xref ref-type="bibr" rid="ref11">11</xref>]. For data sets with low case prevalence (imbalanced data), AUROC can be inaccurate and using AUPRC is preferred [<xref ref-type="bibr" rid="ref13">13</xref>].</p>
          <p>To determine whether the performance of the method significantly differed from that of the word-matching method, we implemented the 5×2 cross-validation procedure described by Dietterich [<xref ref-type="bibr" rid="ref14">14</xref>]. The 5×2 cross-validation procedure splits the data into 2 equal-sized sets in each repetition. The differences between the classifiers are then estimated with a two-tailed paired <italic>t</italic> test with a significance level of 0.05. This approach takes into account the problem of dependence between the measurements.</p>
          <p>The F1 score served as the primary criterion for picking the final method. The F1 score reflects the trade-off between precision and recall as it is the harmonic mean of the two [<xref ref-type="bibr" rid="ref15">15</xref>]. The best performing model was compared to the other classifiers with two-tailed paired <italic>t</italic> tests (α=.05) in the 5×2 cross-validation, to evaluate whether the best performing model significantly outperformed the other candidates.</p>
        </sec>
        <sec>
          <title>Sensitivity Analyses</title>
          <p>We ran 2 sensitivity analyses on the Leiden data. To evaluate the influence of sample size on the performance of a classifier, we employed the classifier on the Leiden-A data set with decreasing sample sizes within the same 10-fold cross-validation setup. To test the effect of disease prevalence on the classifier’s performance, we created subsets of the Leiden-A set with different fractions of patients with rheumatoid arthritis, applied the classifier to these data, and compared the AUPRC between the subsets.</p>
        </sec>
      </sec>
      <sec>
        <title>Final Method Testing of Case Identification (Leiden-B and Erlangen-B)</title>
        <p>In the final test phase (using the B data sets), we obtained reliable estimates of the selected model’s performance. We applied the trained model for the best performing method from the A data sets directly to the B data sets (Leiden-B, n=1000; Erlangen-B, n=478). To make a final call on rheumatoid arthritis status, one must define a threshold for the probability. The final test characteristics of the model are affected by the chosen probability cut-off. We report the PPV, sensitivity, and F1 score for each B data set at 2 operator points learned from the A data sets: (1) optimized PPV, thus favoring high-certainty cases and (2) optimized sensitivity, thus favoring the inclusive selection of cases.</p>
      </sec>
      <sec>
        <title>Implementation and Availability</title>
        <p>Machine learning methods, model training, and evaluations were performed with the scikit-learn package (version 0.21.2) in Python (version 3.5) [<xref ref-type="bibr" rid="ref11">11</xref>]. At all times, default implementations and default settings were used. All scripts including instructions on how to apply the methods are posted online [<xref ref-type="bibr" rid="ref16">16</xref>].</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Data</title>
        <p>Leiden-A (n=2000) and Leiden-B (n=1000) annotated data sets had nearly equal percentages of patients with rheumatoid arthritis (Leiden-A: 154/2000, 7.7%; Leiden-B: 84/1000, 8.4%). Erlangen-A (n=4293) and Erlangen-B (n=478) annotated data sets also had nearly equal percentages of patients with rheumatoid arthritis (Erlangen-A: 1071/4293, 24.9%; Erlangen-B: 112/478, 23.4%).</p>
      </sec>
      <sec>
        <title>Leiden</title>
        <sec>
          <title>Preprocessing</title>
          <p>We found a total of 114,529 words and 8355 unique words in the Leiden-A data after segmentation. With lemmatization and lowercase conversion, the number of unique words was 8141. After removing the most common words with a stop word filter, only 88,524 words and 8078 unique words remained. There were 133,161 unique word combinations (<italic>n</italic>-grams) in the text. The term frequency by inverse document frequency transformation resulted in a sparse matrix of 2000×133,161.</p>
        </sec>
        <sec>
          <title>Performance Evaluation of Machine Learning Methods</title>
          <p>Naïve word-matching had a good overall performance (AUROC: mean 0.90, SD 0.02), which was significantly better (<italic>P</italic>&#60;.001) than that of a random classifier (AUROC: mean 0.50, SD 0.01). Although naïve word-matching showed good overall test performance, it had a low AUPRC value (mean 0.36, SD 0.07), indicating that the naïve word-matching would generate many false positives. Four machine learning methods outperformed naïve word-matching (AUROC: naïve Bayes: mean 0.71, SD 0.03, <italic>P</italic>=.003; neural network: mean 0.98, SD 0, <italic>P</italic>=.005; random forest: mean 0.95, SD 0.01, <italic>P</italic>=.007; support vector machine: mean 0.98, SD 0.01, <italic>P</italic>=.004; gradient boosting: mean 0.98, SD 0.01, <italic>P</italic>=.003; decision tree: mean 0.86, SD 0.05, <italic>P</italic>=.06) (<xref rid="figure2" ref-type="fig">Figure 2</xref>).</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>(A) Receiver operating characteristics and (B) precision-recall curves for all machine learning methods (solid lines) and the naïve word-matching method (dotted line) in the training set (Leiden-A).</p>
            </caption>
            <graphic xlink:href="medinform_v8i11e23930_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <p>The support vector machine had the highest performance in comparison to that of word-matching (AUPRC: mean 0.90, SD 0.02; F1 score: mean 0.83, SD 0.02, <italic>P</italic>&#60;.001). However, the 5×2 cross-validation paired <italic>t</italic> tests revealed that the differences for gradient boosting (<italic>P</italic>=.61), neural network (<italic>P</italic>=.18), and random forest (<italic>P</italic>=.10) were not significant (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p>
        </sec>
        <sec>
          <title>Sensitivity Analyses</title>
          <p>We did not observe any significant loss of precision when lowering the number of training samples from 1000 (original) to 600 patients (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). Neither the AUROC nor the AUPRC showed a significant difference (<italic>P</italic>=.17 and <italic>P</italic>=.11, respectively). Only when reducing the training set to 450 entries did we observe a significant discrepancy (<italic>P</italic>=.005 and <italic>P</italic>=.005, respectively).</p>
          <p>The classifier’s performance maintained an AUPRC &#62;0.80 in settings with highly different disease prevalence (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). Only when disease prevalence was below 4% or above 50% did we detect a difference in performance compared to that of the initial 8% prevalence.</p>
        </sec>
        <sec>
          <title>Cut-Off Selection</title>
          <p>We picked the support vector machine classifier with the median performance in the training stage. This classifier assigns a probability of having rheumatoid arthritis to each patient by summing the coefficients of the features present in the clinical notes of the patient (<xref rid="figure3" ref-type="fig">Figure 3</xref>). The probability cut-offs for optimized PPV (&#62;0.95) and optimized sensitivity (&#62;0.95) were 0.99 and 0.53, respectively (<xref rid="figure4" ref-type="fig">Figure 4</xref>).</p>
          <p>The probability cut-off for optimized PPV resulted in the following test characteristics: PPV 0.96, sensitivity 0.70, specificity 1.00, negative predictive value [NPV] 1.00, and F1 score 0.81. The probability cut-off for optimized sensitivity resulted in the following test characteristics: PPV 0.72, sensitivity 0.96, specificity 0.97, NPV 1.00, and F1 score 0.82.</p>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>The relative importance (coefficients) of the top 20 features in the Leiden-A data set according to the final support vector machine model. The initial data were in Dutch; we translated the words to English in this figure to improve readability.</p>
            </caption>
            <graphic xlink:href="medinform_v8i11e23930_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <fig id="figure4" position="float">
            <label>Figure 4</label>
            <caption>
              <p>Swarm plot depicting the support vector machine–derived probability of being either non-rheumatoid arthritis (blue) or rheumatoid arthritis (green) for the Leiden-A data set. The dotted lines display the optimal cut-offs. Sens: sensitivity; Spec: specificity; PPV: positive predictive value; NPV: negative predictive value; Acc: accuracy; F1: F1 score.</p>
            </caption>
            <graphic xlink:href="medinform_v8i11e23930_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Final Method Testing of Case Identification</title>
          <p>In the Leiden-B data set, the rheumatoid arthritis support vector machine classifier (<xref ref-type="table" rid="table1">Table 1</xref>) identified 64 cases with a cut-off of 0.99 (with corresponding PPV 0.94, sensitivity 0.71, specificity 1.00, NPV 0.97, and F1 score 0.81) and 104 cases with a cut-off of 0.53 (with corresponding PPV 0.75, sensitivity 0.93, specificity 0.97, NPV 0.99, and F1 score 0.83). In the complete Leiden data set of 23,300 patients, using the first (precise) cut-off resulted in 2873 cases of rheumatoid arthritis, and using the second (inclusive) cut-off resulted in 6453 cases of rheumatoid arthritis.</p>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Support vector machine confusion matrices for the Leiden-B test set (n=1000).</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="240"/>
              <col width="190"/>
              <col width="190"/>
              <col width="190"/>
              <col width="190"/>
              <thead>
                <tr valign="top">
                  <td>Clinician-based</td>
                  <td colspan="2">Support vector machine 1 (cut-off=0.99)</td>
                  <td colspan="2">Support vector machine 2 (cut-off=0.53)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Non–rheumatoid arthritis</td>
                  <td>Rheumatoid arthritis</td>
                  <td>Non–rheumatoid arthritis</td>
                  <td>Rheumatoid arthritis</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Non–rheumatoid arthritis</td>
                  <td>912 (true negative)</td>
                  <td>4 (false positive)</td>
                  <td>890 (true negative)</td>
                  <td>26 (false positive)</td>
                </tr>
                <tr valign="top">
                  <td>Rheumatoid arthritis</td>
                  <td>24 (false negative)</td>
                  <td>60 (true positive)</td>
                  <td>6 (false negative)</td>
                  <td>78 (true positive)</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
        </sec>
      </sec>
      <sec>
        <title>Validation of Workflow in Erlangen Data</title>
        <sec>
          <title>Training and Model Selection</title>
          <p>To evaluate the universal applicability of the workflow, we employed the full pipeline on Erlangen data sets. Again, we ran all machine learning methods to find the best performing method using the Erlangen-A data set. Gradient boosting achieved the best performance (AUROC 0.94; AUPRC 0.85; F1 score 0.81) (<xref rid="figure5" ref-type="fig">Figure 5</xref>). The probability cut-offs for optimized PPV (&#62;0.90) and optimized sensitivity (&#62;0.90) were 0.79 and 0.19, respectively (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>).</p>
          <fig id="figure5" position="float">
            <label>Figure 5</label>
            <caption>
              <p>(A) Receiver operating characteristics and (B) precision-recall curves for all machine learning methods (solid lines) and the naïve word-matching method (dotted line) in the training set (Erlangen-A).</p>
            </caption>
            <graphic xlink:href="medinform_v8i11e23930_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Final Method Testing of Case Identification</title>
          <p>When we applied the model on the test data set (Erlangen-B), we obtained similar performance (<xref ref-type="table" rid="table2">Table 2</xref>) with the predefined cut-offs as those found for the training data set (Erlangen-A). The gradient boosting classifier identified 59 cases with a cut-off of 0.79 (with corresponding PPV 0.97, sensitivity 0.51, specificity 0.99, NPV 0.87, and F1 score 0.67) and 131 cases with the cut-off of 0.19 (with corresponding PPV 0.72, sensitivity 0.84, specificity 0.90, NPV 0.95, and F1 score 0.77).</p>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>Gradient boosting confusion matrices for the Erlangen-B test set (n=478).</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="240"/>
              <col width="190"/>
              <col width="190"/>
              <col width="190"/>
              <col width="190"/>
              <thead>
                <tr valign="top">
                  <td>Clinician-based</td>
                  <td colspan="2">Gradient boosting 1 (cut-off=0.79)</td>
                  <td colspan="2">Gradient boosting 2 (cut-off=0.19)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Non–rheumatoid arthritis</td>
                  <td>Rheumatoid arthritis</td>
                  <td>Non–rheumatoid arthritis</td>
                  <td>Rheumatoid arthritis</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Non–rheumatoid arthritis</td>
                  <td>364 (true negative)</td>
                  <td>2 (false positive)</td>
                  <td>329 (true negative)</td>
                  <td>37 (false positive)</td>
                </tr>
                <tr valign="top">
                  <td>Rheumatoid arthritis</td>
                  <td>55 (false negative)</td>
                  <td>57 (true positive)</td>
                  <td>18 (false negative)</td>
                  <td>94 (true positive)</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Our study describes the results of a pipeline that applies multiple machine learning methods as well as naïve word-matching to create algorithms of case selection (patients with rheumatoid arthritis in our example) from electronic medical records. We observed that most methods outperform a naïve word-matching algorithm. Our pipeline created algorithms on both Dutch and German data that showed a high performance in the testing and validation phase (F1 score 0.83 and 0.82, respectively). When we defined the cut-offs for case selection from the first data set aiming for either a high sensitivity or high PPV, we observed that the performances were robust in the second data sets (Leiden-B: PPV 0.94 and sensitivity 0.93; Erlangen-B: PPV 0.97 and sensitivity 0.84).</p>
        <p>We believe that our approach of making a center-specific algorithm is more attractive than the application of an algorithm developed elsewhere, since our method is more precise, does not require standardization, and most importantly, it ensures high performance within the center. Our method requires only an effort similar to that of applying predefined algorithms, namely chart reviewing a subset of data. Furthermore, our workflow respects the user’s requirements regarding the case selection. The case selection can be tailored to being highly precise or sensitive depending on the chosen cut-off.</p>
        <p>Furthermore, this study shows the power of machine learning approaches to generate cohorts of patients in seconds, laying a foundation for allowing studies of cohorts with an unprecedented low cost.</p>
        <p>When applying our support vector machine classifier on the complete Leiden University Medical Centre’s database of 23,300 cases (including the 3000 annotated records) we identified 2873 rheumatoid arthritis cases when employing the stringent probability threshold of 0.99. The automatic annotation only took 6.17 seconds, a fraction of the amount of time it would take to review the medical charts manually.</p>
      </sec>
      <sec>
        <title>Future Directions</title>
        <p>Our aim was to implement a broadly applicable workflow. The current versions require installing Anaconda (version 5.1.0) and Python (version 3.6). Researchers without any computational experience might feel a certain reluctance to start the pipeline. We tested (without quantification) how easily someone outside our center could run the pipeline, by sending the scripts to scientists at Erlangen. Though they implemented the pipeline with relative ease, we do acknowledge that it was done by someone with experience in computational languages. Also, testing the pipeline in Erlangen exposed some unclarities in the scripts, which have been improved. The next step would be to perform a usability study, where we could ask users for their experience as well as test how much time it takes them to get the script running. We could further improve the usability of the pipeline by creation of a web-based interface where people could upload their data and get back their results automatically. This would require substantial computational resources as the data sets are large. In addition, we would need to ensure encryption processes, as clinical notes carry a high risk of privacy breaches.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>We want to note 3 important shortcomings of our study. The first limitation is that deploying the pipeline requires user familiarity with implementation software. Our proposed workflow facilitates building a classifier with a step-by-step implementation. Affinity with programming is not required, because all functions for training and evaluation are already provided. However, some software experience is beneficial when setting up the environment for the pipeline to run. With the emergence of machine learning and natural language processing we would argue that it becomes increasingly useful to possess the skills required to implement software.</p>
        <p>Second, we acknowledge that the workflow was evaluated in only 2 centers, both with Germanic languages. Although the pipeline provides language-specific preprocessing with pretrained tools for most languages, it would be interesting to investigate if similar performance can be achieved in centers with low lexical similarities to the Dutch language (eg, languages without a Latin-based alphabet).</p>
        <p>Finally, we acknowledge that the models’ performances can be further optimized by fine-tuning hyperparameters. These are parameters of the machine learning method that are provided prior to training the machine learning method. Additionally, it is possible to adjust the size of the n-grams to improve the performance. Since our models consistently performed very well in training and testing, we did not optimize any parameters in our study. Furthermore, we only evaluated a handful of candidate machine learning methods. Our selection is by no means an exhaustive list of available techniques in the field. We selected these methods as they cover a variety of machine learning methods and are widely known.</p>
      </sec>
      <sec>
        <title>Lessons Learned</title>
        <p>We were able to conduct a stringent flow of training and testing, whereby we used several independent data sets to, first, optimize the classifiers, and second, to ensure reliable calculations of the classifiers’ performances by using k-fold cross-validation and both receiver operating characteristic and precision recall curves on 10-fold cross-validation, providing a good indication of performance on unseen data.</p>
        <p>To select the best classifier, we performed paired <italic>t</italic> tests on 5×2 cross-validation rather than 10-fold cross-validation. Although performing a paired <italic>t</italic> test on 10-fold cross-validation is a very common practice, we learned that this test is not recommended. The correlation between overlaps violates the <italic>t</italic> test’s assumption of independence, resulting in more false positives (increased type I error); 5×2 cross-validation splits the data set 50/50 and is, therefore, more suitable for statistical analysis. However, 5×2 cross-validation is confined to a small training set, which is why we also used 10-fold cross-validation to approximate the performance on unseen data.</p>
        <p>Our study is not the first to examine methods for disease identification from EHRs [<xref ref-type="bibr" rid="ref3">3</xref>]. Studies have employed high-throughput methods on structured data such as ICD (billing) codes. Regrettably, such codes have a poor performance because they describe why a patient is examined, which does not strictly mean that a patient has that diagnosis. More successful algorithms (often called phenotype algorithms) combined a variety of methods including rule-based case identification and natural language processing [<xref ref-type="bibr" rid="ref2">2</xref>]. Though these algorithms have a good median performance when tested in multiple clinics, at the level of an individual center, PPV varies (below 0.5 for several clinics) [<xref ref-type="bibr" rid="ref2">2</xref>]. Moreover, several centers required additional tailoring to allow application of the algorithms. This is not surprising since health clinics have different protocols for registering information.</p>
        <p>As gold standard, we purposely chose the diagnosis of the treating rheumatologist in contrast to counting the disease classification criteria [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. The problem with the latter is that classification criteria have been developed for research and not for clinical practice where all information including additional tests in the differential diagnostic workup are taken into account. Moreover, the exact information for individual criteria is often not precisely registered in EHRs.</p>
        <p>We ran several sensitivity analyses to explore the influence of disease prevalence and number of selected patients on the model's performance. The support vector machine classifier was robust over different selections of training data (low standard error on the cross-validation results), number of training samples, and imbalances of case number. These analyses also showed that in our Leiden data the annotation of 600 patients would have been sufficient to build a reliable classifier. We acknowledge that due to difference in feature variance, the optimal number of patients required to train the classifier might differ between centers.</p>
      </sec>
      <sec>
        <title>Generalizability of the Workflow</title>
        <p>The support vector machine was the best classifier for Leiden-A (F1 score 0.83), although the difference was not significant with respect to the gradient boosting, neural networks, and random forest. The support vector machine was employed in the independent Leiden-B data set with similarly good performance (F1 score 0.81). We predefined 2 thresholds of the rheumatoid arthritis support vector machine probabilities on the first Leiden data (Leiden-A) aiming for either a high precision (PPV 0.94), or a high sensitivity (sensitivity 0.93). When we applied these predefined cut-offs in the second set of patients we obtained similarly high test characteristics (PPV 0.96, sensitivity 0.70, specificity 1.00, NPV 1.00 with the highly precise threshold, and PPV 0.72, sensitivity 0.96, specificity 0.97, NPV 1.00 with the highly sensitive threshold). Finally, we ran the same workflow of training and testing as employed on the Dutch Leiden data to the German Erlangen data. Again, we built a high performing classifier (in this case gradient boosting performed best) that gave consistent results for both settings (PPV 0.97, sensitivity 0.51, specificity 0.99, NPV 0.87 with the highly precise threshold, and PPV 0.72, sensitivity 0.84, specificity 0.90, NPV 0.95 with the highly inclusive threshold).</p>
        <p>The gradient boosting has the best performance in the Erlangen data, while in the Leiden data the support vector machine performs the best. This is not necessarily surprising, as “there is no such thing as a free lunch” (meaning that a universal best algorithm does not exist) [<xref ref-type="bibr" rid="ref19">19</xref>]. The high performance of the support vector machine is achieved by generalizing the Leiden data. There is no guarantee that the technique used in the Leiden data set will also perform the best in the Erlangen data set. Notably, in each data set, both methods performed very well with only very modest differences. The slight deviations in performance between the methods could be caused by language differences and characteristic notations of the center.</p>
        <p>In accordance with the FAIR principles [<xref ref-type="bibr" rid="ref20">20</xref>], we have made all our scripts publicly available and optimized them so scientists may use them regardless of prior experience (<xref rid="figure6" ref-type="fig">Figure 6</xref>) [<xref ref-type="bibr" rid="ref16">16</xref>]. We advise centers not to use our specific classifier but to follow the workflow as presented in this paper and build a classifier that fits the local data best.</p>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Flowchart describing the steps to apply the machine learning scripts to new data. EHR: electronic health record; MLM: machine learning method; NLP: natural language processing; PPV: positive predictive value; ROC: receiver operating characteristic; TF-IDF: term frequency by inverse document frequency.</p>
          </caption>
          <graphic xlink:href="medinform_v8i11e23930_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>The workflow facilitates the production of highly reliable center-specific machine learning methods for the identification of patients with rheumatoid arthritis from format-free text fields. Our results suggest that our workflow can easily be applied to other EHRs or other diseases and is not restrained by specific language, EHR software, or treatments. This methodology of machine learning for EHR data extraction facilitates cohort studies (with regard to cost and size).</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Overview of study details.</p>
        <media xlink:href="medinform_v8i11e23930_app1.docx" xlink:title="DOCX File , 20 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Average F1 score for all machine learning methods (Leiden-A).</p>
        <media xlink:href="medinform_v8i11e23930_app2.png" xlink:title="PNG File , 91 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Performance of SVM on increasing training set within the Leiden-A data set.</p>
        <media xlink:href="medinform_v8i11e23930_app3.png" xlink:title="PNG File , 442 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Performance of SVM on increasing prevalence in Leiden-A.</p>
        <media xlink:href="medinform_v8i11e23930_app4.png" xlink:title="PNG File , 256 KB"/>
      </supplementary-material>
      <supplementary-material id="app5">
        <label>Multimedia Appendix 5</label>
        <p>Swarm plot depicting gradient boosting–derived probability of being rheumatoid arthritis.</p>
        <media xlink:href="medinform_v8i11e23930_app5.png" xlink:title="PNG File , 315 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AUPRC</term>
          <def>
            <p>area under the precision recall curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AUROC</term>
          <def>
            <p>area under the receiver operating characteristic curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">ICD-9</term>
          <def>
            <p>International Classification of Diseases, Ninth Revision</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">NPV</term>
          <def>
            <p>negative predictive value</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">PPV</term>
          <def>
            <p>positive predictive value</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This study was supported by the Dutch Arthritis Association (<italic>ReumaNederland</italic>) 15-3-301 and by Measurement of Efficacy of Treatment in the 'Era of Outcome' in Rheumatology (project number RP 2014-03).</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liao</surname>
              <given-names>KP</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Gainer</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Goryachev</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng-Treitler</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Raychaudhuri</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Churchill</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kohane</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Karlson</surname>
              <given-names>EW</given-names>
            </name>
            <name name-style="western">
              <surname>Plenge</surname>
              <given-names>RM</given-names>
            </name>
          </person-group>
          <article-title>Electronic medical records for discovery research in rheumatoid arthritis</article-title>
          <source>Arthritis Care Res (Hoboken)</source>
          <year>2010</year>
          <month>08</month>
          <volume>62</volume>
          <issue>8</issue>
          <fpage>1120</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1002/acr.20184"/>
          </comment>
          <pub-id pub-id-type="doi">10.1002/acr.20184</pub-id>
          <pub-id pub-id-type="medline">20235204</pub-id>
          <pub-id pub-id-type="pmcid">PMC3121049</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kirby</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Speltz</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Rasmussen</surname>
              <given-names>LV</given-names>
            </name>
            <name name-style="western">
              <surname>Basford</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gottesman</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Peissig</surname>
              <given-names>PL</given-names>
            </name>
            <name name-style="western">
              <surname>Pacheco</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Tromp</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Pathak</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Carrell</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Ellis</surname>
              <given-names>SB</given-names>
            </name>
            <name name-style="western">
              <surname>Lingren</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Thompson</surname>
              <given-names>WK</given-names>
            </name>
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Haines</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Roden</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>PA</given-names>
            </name>
            <name name-style="western">
              <surname>Denny</surname>
              <given-names>JC</given-names>
            </name>
          </person-group>
          <article-title>PheKB: a catalog and workflow for creating electronic phenotype algorithms for transportability</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2016</year>
          <month>11</month>
          <volume>23</volume>
          <issue>6</issue>
          <fpage>1046</fpage>
          <lpage>1052</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27026615"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocv202</pub-id>
          <pub-id pub-id-type="medline">27026615</pub-id>
          <pub-id pub-id-type="pii">ocv202</pub-id>
          <pub-id pub-id-type="pmcid">PMC5070514</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jamian</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wheless</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Crofford</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Barnado</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Rule-based and machine learning algorithms identify patients with systemic sclerosis accurately in the electronic health record</article-title>
          <source>Arthritis Res Ther</source>
          <year>2019</year>
          <month>12</month>
          <day>30</day>
          <volume>21</volume>
          <issue>1</issue>
          <fpage>305</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arthritis-research.biomedcentral.com/articles/10.1186/s13075-019-2092-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13075-019-2092-7</pub-id>
          <pub-id pub-id-type="medline">31888720</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13075-019-2092-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC6937803</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>van den Berg</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>van der Heijde</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Landewé</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>van Lambalgen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Huizinga</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>The METEOR initiative: the way forward for optimal, worldwide data integration to improve care for RA patients</article-title>
          <source>Clin Exp Rheumatol</source>
          <year>2014</year>
          <volume>32</volume>
          <issue>5 Suppl 85</issue>
          <fpage>S-135</fpage>
          <lpage>S-140</lpage>
          <pub-id pub-id-type="medline">25365103</pub-id>
          <pub-id pub-id-type="pii">8616</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pedregosa</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Varoquaux</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gramfort</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Michel</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Thirion</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Grisel</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Blondel</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Prettenhofer</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Weiss</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Dubourg</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Vanderplas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Passos</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cournapeau</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Brucher</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Perrot</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Duchesnay</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Scikit-learn: machine learning in Python</article-title>
          <source>J Mach Learn Res</source>
          <year>2011</year>
          <volume>12</volume>
          <fpage>2825</fpage>
          <lpage>2830</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmlr.org/papers/volume12/pedregosa11a/pedregosa11a.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>CD</given-names>
            </name>
            <name name-style="western">
              <surname>Raghavan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Schütze</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Text classification and naive Bayes</article-title>
          <source>Introduction to Information Retrieval</source>
          <year>2008</year>
          <month>7</month>
          <day>7</day>
          <publisher-loc>Cambridge</publisher-loc>
          <publisher-name>Cambridge University Press</publisher-name>
          <fpage>253</fpage>
          <lpage>289</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lightbody</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Irwin</surname>
              <given-names>GW</given-names>
            </name>
          </person-group>
          <article-title>Multi-layer perceptron based modelling of nonlinear systems</article-title>
          <source>Fuzzy Sets and Systems</source>
          <year>1996</year>
          <month>4</month>
          <day>8</day>
          <volume>79</volume>
          <issue>1</issue>
          <fpage>93</fpage>
          <lpage>112</lpage>
          <pub-id pub-id-type="doi">10.1016/0165-0114(95)00293-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Breiman</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Random forests</article-title>
          <source>Machine Learning</source>
          <year>2001</year>
          <volume>45</volume>
          <fpage>5</fpage>
          <lpage>32</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/content/pdf/10.1023/A:1010933404324.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id>
          <pub-id pub-id-type="medline">21816105</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bennett</surname>
              <given-names>KP</given-names>
            </name>
            <name name-style="western">
              <surname>Campbell</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Support vector machines</article-title>
          <source>SIGKDD Explor Newsl</source>
          <year>2000</year>
          <month>12</month>
          <day>01</day>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>1</fpage>
          <lpage>13</lpage>
          <pub-id pub-id-type="doi">10.1145/380995.380999</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Friedman</surname>
              <given-names>JH</given-names>
            </name>
          </person-group>
          <article-title>Greedy function approximation: a gradient boosting machine</article-title>
          <source>Ann Statist</source>
          <year>2001</year>
          <month>10</month>
          <volume>29</volume>
          <issue>5</issue>
          <fpage>1189</fpage>
          <lpage>1232</lpage>
          <pub-id pub-id-type="doi">10.1214/aos/1013203451</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Uysal</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Gunal</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>The impact of preprocessing on text classification</article-title>
          <source>Information Processing &#38; Management</source>
          <year>2014</year>
          <month>01</month>
          <volume>50</volume>
          <issue>1</issue>
          <fpage>104</fpage>
          <lpage>112</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ipm.2013.08.006</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rodriguez</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Perez</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lozano</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>Sensitivity analysis of k-fold cross validation in prediction error estimation</article-title>
          <source>IEEE Trans Pattern Anal Mach Intell</source>
          <year>2010</year>
          <month>03</month>
          <volume>32</volume>
          <issue>3</issue>
          <fpage>569</fpage>
          <lpage>575</lpage>
          <pub-id pub-id-type="doi">10.1109/tpami.2009.187</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Saito</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Rehmsmeier</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>The precision-recall plot is more informative than the ROC plot when evaluating binary classifiers on imbalanced datasets</article-title>
          <source>PLoS One</source>
          <year>2015</year>
          <volume>10</volume>
          <issue>3</issue>
          <fpage>e0118432</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0118432"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0118432</pub-id>
          <pub-id pub-id-type="medline">25738806</pub-id>
          <pub-id pub-id-type="pii">PONE-D-14-26790</pub-id>
          <pub-id pub-id-type="pmcid">PMC4349800</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dietterich</surname>
              <given-names>TG</given-names>
            </name>
          </person-group>
          <article-title>Approximate statistical tests for comparing supervised classification learning algorithms</article-title>
          <source>Neural Comput</source>
          <year>1998</year>
          <month>09</month>
          <day>15</day>
          <volume>10</volume>
          <issue>7</issue>
          <fpage>1895</fpage>
          <lpage>1923</lpage>
          <pub-id pub-id-type="doi">10.1162/089976698300017197</pub-id>
          <pub-id pub-id-type="medline">9744903</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sokolova</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lapalme</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>A systematic analysis of performance measures for classification tasks</article-title>
          <source>Information Processing &#38; Management</source>
          <year>2009</year>
          <month>7</month>
          <volume>45</volume>
          <issue>4</issue>
          <fpage>427</fpage>
          <lpage>437</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ipm.2009.03.002</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Maarseveen</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>DiagnosisExtraction_ML</article-title>
          <source>GitHub</source>
          <access-date>2020-10-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/levrex/DiagnosisExtraction_ML">https://github.com/levrex/DiagnosisExtraction_ML</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Arnett</surname>
              <given-names>FC</given-names>
            </name>
            <name name-style="western">
              <surname>Edworthy</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Bloch</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>McShane</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Fries</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Cooper</surname>
              <given-names>NS</given-names>
            </name>
            <name name-style="western">
              <surname>Healey</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Kaplan</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>MH</given-names>
            </name>
            <name name-style="western">
              <surname>Luthra</surname>
              <given-names>HS</given-names>
            </name>
          </person-group>
          <article-title>The American Rheumatism Association 1987 revised criteria for the classification of rheumatoid arthritis</article-title>
          <source>Arthritis Rheum</source>
          <year>1988</year>
          <month>03</month>
          <volume>31</volume>
          <issue>3</issue>
          <fpage>315</fpage>
          <lpage>324</lpage>
          <pub-id pub-id-type="doi">10.1002/art.1780310302</pub-id>
          <pub-id pub-id-type="medline">3358796</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Aletaha</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Neogi</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Silman</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Funovits</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Felson</surname>
              <given-names>DT</given-names>
            </name>
            <name name-style="western">
              <surname>Bingham</surname>
              <given-names>CO</given-names>
            </name>
            <name name-style="western">
              <surname>Birnbaum</surname>
              <given-names>NS</given-names>
            </name>
            <name name-style="western">
              <surname>Burmester</surname>
              <given-names>GR</given-names>
            </name>
            <name name-style="western">
              <surname>Bykerk</surname>
              <given-names>VP</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Combe</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Costenbader</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Dougados</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Emery</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ferraccioli</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Hazes</surname>
              <given-names>JMW</given-names>
            </name>
            <name name-style="western">
              <surname>Hobbs</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Huizinga</surname>
              <given-names>TWJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kavanaugh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kay</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kvien</surname>
              <given-names>TK</given-names>
            </name>
            <name name-style="western">
              <surname>Laing</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mease</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ménard</surname>
              <given-names>HA</given-names>
            </name>
            <name name-style="western">
              <surname>Moreland</surname>
              <given-names>LW</given-names>
            </name>
            <name name-style="western">
              <surname>Naden</surname>
              <given-names>RL</given-names>
            </name>
            <name name-style="western">
              <surname>Pincus</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Smolen</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Stanislawska-Biernat</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Symmons</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Tak</surname>
              <given-names>PP</given-names>
            </name>
            <name name-style="western">
              <surname>Upchurch</surname>
              <given-names>KS</given-names>
            </name>
            <name name-style="western">
              <surname>Vencovský</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wolfe</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Hawker</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>2010 Rheumatoid arthritis classification criteria: an American College of Rheumatology/European League Against Rheumatism collaborative initiative</article-title>
          <source>Arthritis Rheum</source>
          <year>2010</year>
          <month>09</month>
          <volume>62</volume>
          <issue>9</issue>
          <fpage>2569</fpage>
          <lpage>2581</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.doi.org/10.1002/art.27584"/>
          </comment>
          <pub-id pub-id-type="doi">10.1002/art.27584</pub-id>
          <pub-id pub-id-type="medline">20872595</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wolpert</surname>
              <given-names>DH</given-names>
            </name>
            <name name-style="western">
              <surname>Macready</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>No free lunch theorems for optimization</article-title>
          <source>IEEE Trans Evol Computat</source>
          <year>1997</year>
          <month>5</month>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>67</fpage>
          <lpage>82</lpage>
          <pub-id pub-id-type="doi">10.1109/4235.585893</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wilkinson</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Dumontier</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aalbersberg</surname>
              <given-names>IJJ</given-names>
            </name>
            <name name-style="western">
              <surname>Appleton</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Axton</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Baak</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Blomberg</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Boiten</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>da Silva Santos</surname>
              <given-names>LB</given-names>
            </name>
            <name name-style="western">
              <surname>Bourne</surname>
              <given-names>PE</given-names>
            </name>
            <name name-style="western">
              <surname>Bouwman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Brookes</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Crosas</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dillo</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Dumon</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Edmunds</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Evelo</surname>
              <given-names>CT</given-names>
            </name>
            <name name-style="western">
              <surname>Finkers</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez-Beltran</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gray</surname>
              <given-names>AJG</given-names>
            </name>
            <name name-style="western">
              <surname>Groth</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Goble</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Grethe</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Heringa</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>'t Hoen</surname>
              <given-names>PAC</given-names>
            </name>
            <name name-style="western">
              <surname>Hooft</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kuhn</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kok</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kok</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lusher</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Martone</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Mons</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Packer</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Persson</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Rocca-Serra</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Roos</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>van Schaik</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sansone</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schultes</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Sengstag</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Slater</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Strawn</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Swertz</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Thompson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>van der Lei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>van Mulligen</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Velterop</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Waagmeester</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wittenburg</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Wolstencroft</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mons</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>The FAIR Guiding Principles for scientific data management and stewardship</article-title>
          <source>Sci Data</source>
          <year>2016</year>
          <month>03</month>
          <day>15</day>
          <volume>3</volume>
          <fpage>160018</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26978244"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/sdata.2016.18</pub-id>
          <pub-id pub-id-type="medline">26978244</pub-id>
          <pub-id pub-id-type="pii">sdata201618</pub-id>
          <pub-id pub-id-type="pmcid">PMC4792175</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
