<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i5e14330</article-id>
      <article-id pub-id-type="pmid">32369038</article-id>
      <article-id pub-id-type="doi">10.2196/14330</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Use of Machine Learning Techniques for Case-Detection of Varicella Zoster Using Routinely Collected Textual Ambulatory Records: Pilot Observational Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Bruining</surname>
            <given-names>Nico</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Bajpai</surname>
            <given-names>Ram</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Torii</surname>
            <given-names>Manabu</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Polepalli Ramesh</surname>
            <given-names>Balaji</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Lanera</surname>
            <given-names>Corrado</given-names>
          </name>
          <degrees>MSc, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0520-7428</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Berchialla</surname>
            <given-names>Paola</given-names>
          </name>
          <degrees>MSc, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5835-5638</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Baldi</surname>
            <given-names>Ileana</given-names>
          </name>
          <degrees>MSc, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8578-9164</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Lorenzoni</surname>
            <given-names>Giulia</given-names>
          </name>
          <degrees>MA, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1771-4686</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Tramontan</surname>
            <given-names>Lara</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7446-0972</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Scamarcia</surname>
            <given-names>Antonio</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9953-646X</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Cantarutti</surname>
            <given-names>Luigi</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5514-4085</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Giaquinto</surname>
            <given-names>Carlo</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9365-0413</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Gregori</surname>
            <given-names>Dario</given-names>
          </name>
          <degrees>MA, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Cardiac Thoracic Vascular Sciences and Public Health</institution>
            <institution>University of Padova</institution>
            <institution>Unit of Biostatistics, Epidemiology and Public Health</institution>
            <addr-line>Via Leonardo Loredan 18</addr-line>
            <addr-line>Padova, 35121</addr-line>
            <country>Italy</country>
            <phone>39 049 827 5384</phone>
            <fax>39 049 827 5407</fax>
            <email>dario.gregori@unipd.it</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7906-0580</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Cardiac Thoracic Vascular Sciences and Public Health</institution>
        <institution>University of Padova</institution>
        <institution>Unit of Biostatistics, Epidemiology and Public Health</institution>
        <addr-line>Padova</addr-line>
        <country>Italy</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Clinical and Biological Science</institution>
        <institution>University of Turin</institution>
        <addr-line>Torino</addr-line>
        <country>Italy</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Arsenàl.IT</institution>
        <addr-line>Treviso</addr-line>
        <country>Italy</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Società Servizi Telematici, Pedianet</institution>
        <addr-line>Padova</addr-line>
        <country>Italy</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Women's and Children's Health</institution>
        <institution>University of Padova</institution>
        <addr-line>Padova</addr-line>
        <country>Italy</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Dario Gregori <email>dario.gregori@unipd.it</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>5</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>5</day>
        <month>5</month>
        <year>2020</year>
      </pub-date>
      <volume>8</volume>
      <issue>5</issue>
      <elocation-id>e14330</elocation-id>
      <history>
        <date date-type="received">
          <day>10</day>
          <month>4</month>
          <year>2019</year>
        </date>
        <date date-type="rev-request">
          <day>20</day>
          <month>6</month>
          <year>2019</year>
        </date>
        <date date-type="rev-recd">
          <day>28</day>
          <month>8</month>
          <year>2019</year>
        </date>
        <date date-type="accepted">
          <day>16</day>
          <month>12</month>
          <year>2019</year>
        </date>
      </history>
      <copyright-statement>©Corrado Lanera, Paola Berchialla, Ileana Baldi, Giulia Lorenzoni, Lara Tramontan, Antonio Scamarcia, Luigi Cantarutti, Carlo Giaquinto, Dario Gregori. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 05.05.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2020/5/e14330" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The detection of infectious diseases through the analysis of free text on electronic health reports (EHRs) can provide prompt and accurate background information for the implementation of preventative measures, such as advertising and monitoring the effectiveness of vaccination campaigns.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>The purpose of this paper is to compare machine learning techniques in their application to EHR analysis for disease detection.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>The Pedianet database was used as a data source for a real-world scenario on the identification of cases of varicella. The models’ training and test sets were based on two different Italian regions’ (Veneto and Sicilia) data sets of 7631 patients and 1,230,355 records, and 2347 patients and 569,926 records, respectively, for whom a gold standard of varicella diagnosis was available. Elastic-net regularized generalized linear model (GLMNet), maximum entropy (MAXENT), and LogitBoost (boosting) algorithms were implemented in a supervised environment and 5-fold cross-validated. The document-term matrix generated by the training set involves a dictionary of 1,871,532 tokens. The analysis was conducted on a subset of 29,096 tokens, corresponding to a matrix with no more than a 99% sparsity ratio.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The highest predictive values were achieved through boosting (positive predictive value [PPV] 63.1%, 95% CI 42.7-83.5 and negative predictive value [NPV] 98.8%, 95% CI 98.3-99.3). GLMNet delivered superior predictive capability compared to MAXENT (PPV 24.5% and NPV 98.3% vs PPV 11.0% and NPV 98.0%). MAXENT and GLMNet predictions weakly agree with each other (agreement coefficient 1 [AC1]=0.60, 95% CI 0.58-0.62), as well as with LogitBoost (MAXENT: AC1=0.64, 95% CI 0.63-0.66 and GLMNet: AC1=0.53, 95% CI 0.51-0.55).</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Boosting has demonstrated promising performance in large-scale EHR-based infectious disease identification.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>machine learning technique</kwd>
        <kwd>text mining</kwd>
        <kwd>electronic health report</kwd>
        <kwd>varicella zoster</kwd>
        <kwd>pediatric infectious disease</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Improving the predictive capability of infectious disease detection at the population level is an important public health issue that can provide the background information necessary for the implementation of effective control strategies, such as advertising and monitoring the effectiveness of vaccination campaigns [<xref ref-type="bibr" rid="ref1">1</xref>].</p>
      <p>The need for fast, cost-effective, and accurate detection of infection rates has been widely investigated in recent literature [<xref ref-type="bibr" rid="ref2">2</xref>]. Particularly, the combination of increased electronic health report (EHR) implementation in primary care, the growing availability of digital information within the EHR, and the development of data mining techniques offer great promise for accelerating pediatric infectious disease research [<xref ref-type="bibr" rid="ref3">3</xref>].</p>
      <p>Although EHR data are collected prospectively in real time at the point of health care delivery, observational studies intended to retrospectively assess the impact of clinical decisions are likely the most common type of EHR-enabled research [<xref ref-type="bibr" rid="ref3">3</xref>].</p>
      <p>Among the high-impact diseases, the prompt identification of varicella zoster viral infections is of key interest due to the debate around the need and cost-benefit dynamics of a mass-vaccination program for young children [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>].</p>
      <p>Challenges in this context arise from both the unique epidemiological characteristics of varicella zoster with respect to information extraction, such as age-specific consultation rates, seasonality, force of infection, hospitalization rates, and inpatient days [<xref ref-type="bibr" rid="ref6">6</xref>], and from the way that medical records are organized, often in free-format and uncoded fields [<xref ref-type="bibr" rid="ref7">7</xref>]. A critical step is to transform this large amount of health care data into knowledge.</p>
      <p>Data extraction from free text for disease detection at the individual level can be based on manual, in-depth examinations of individual medical records or, to contain costs and ensure time-tightening and control, by automatic coding. Machine learning techniques (MLTs) are the most commonly used approaches [<xref ref-type="bibr" rid="ref8">8</xref>] and show good overall performance [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. Nevertheless, few indications are currently available on the most appropriate technique to use, and comparative evidence is still lacking on the performances of each available technique [<xref ref-type="bibr" rid="ref11">11</xref>] in the field of pediatric infectious disease research.</p>
      <p>In recent years, generalized linear model (GLM)-based techniques have been largely used for the text mining of EHRs, both as a technique of choice [<xref ref-type="bibr" rid="ref12">12</xref>] and as a benchmark [<xref ref-type="bibr" rid="ref13">13</xref>]. The performance of GLMs, especially multinomial or in the simplest cases logistic regression, has been indicated as unsatisfactory [<xref ref-type="bibr" rid="ref14">14</xref>] because they are prone to overfitting and are sensitive to outliers. Enhancements to GLMs have been proposed recently in the form of the lasso and elastic-net regularized GLM [<xref ref-type="bibr" rid="ref15">15</xref>] (GLMNet), multinomial logistic regression (maximum entropy [MAXENT]), and the boosting approach implemented in the LogitBoost algorithm [<xref ref-type="bibr" rid="ref16">16</xref>] to overcome the limitations of naïve GLMs. Nevertheless, to the best of our knowledge, no comparisons have been made among these techniques to determine to what extent improvements are needed.</p>
      <p>The purpose of this study is to make comparisons among enhanced GLM techniques in the setting of automatic disease detection [<xref ref-type="bibr" rid="ref17">17</xref>]. Particularly, these methods will be assessed on their ability to identify cases of varicella from a large set of EHRs.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Electronic Medical Record Database</title>
        <p>The Italian Pedianet database [<xref ref-type="bibr" rid="ref18">18</xref>] collects anonymized clinical data from more than 300 pediatricians throughout the country. This database focuses on children 0-14 years of age [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref22">22</xref>] and records the reasons for accessing health care, diagnosis, and clinical details. The sources of those data are primary care records written in Italian, which are filled in by pediatricians with clinical details about diagnosis and prescriptions; they also contain details about the eventual hospitalization and specialist referrals.</p>
        <p>For the purpose of this study, we were allowed to access only two subsets of the Pedianet database, corresponding to the data collected between 2004 and 2014 in the Italian regions of Veneto (northern Italy) and Sicilia (southern Italy). Since the Veneto region data set was larger, it was considered for carrying out the training of the model. The data set of the Sicilia region provided an independent data set for testing the model. The main characteristics of the two data sets are reported in <xref ref-type="table" rid="table1">Table 1</xref>. It is worth noting that the proportion of positive cases of varicella is different in the two databases. Interpreting differences in prevalence between regions is beyond the purpose of this study; nevertheless, given the smaller prevalence, there is an expected lower positive predictive value (PPV) and a higher negative predictive value (NPV) on the test set.</p>
        <p>The Pedianet source data includes five different tables. In <xref ref-type="table" rid="table2">Table 2</xref>, we report a short description of them.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Main characteristics used for the train (Veneto) and test (Sicilia) data sets.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="250"/>
            <col width="370"/>
            <col width="380"/>
            <thead>
              <tr valign="top">
                <td>Characteristic</td>
                <td>Train</td>
                <td>Test</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Database</td>
                <td>Pedianet</td>
                <td>Pedianet</td>
              </tr>
              <tr valign="top">
                <td>Language</td>
                <td>Italian</td>
                <td>Italian</td>
              </tr>
              <tr valign="top">
                <td>Italian Region</td>
                <td>Veneto</td>
                <td>Sicilia</td>
              </tr>
              <tr valign="top">
                <td>Date span</td>
                <td>January 2, 2004-December 31, 2014</td>
                <td>January 7, 2004-December 30, 2014</td>
              </tr>
              <tr valign="top">
                <td>Records, n</td>
                <td>1,230,355</td>
                <td>569,926</td>
              </tr>
              <tr valign="top">
                <td>Children, n</td>
                <td>7631</td>
                <td>2347</td>
              </tr>
              <tr valign="top">
                <td>Pediatricians, n</td>
                <td>46</td>
                <td>13</td>
              </tr>
              <tr valign="top">
                <td>Positive cases, n (%)</td>
                <td>3481 (45.6%)</td>
                <td>128 (5.4%)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Tables used from the Pedianet database.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="150"/>
            <col width="290"/>
            <col width="120"/>
            <col width="440"/>
            <thead>
              <tr valign="top">
                <td>Table topic</td>
                <td>Content</td>
                <td>Type of data</td>
                <td>Example</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Accessing</td>
                <td>Reasons for accessing the pediatrician and diagnoses</td>
                <td>Free text (including codes)</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Ritardo di crescita &#60;783.4&#62;</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Diaries</td>
                <td>Pediatrician’s free-text diaries</td>
                <td>Free text</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>DIBASE OS GTT 10ML 10000UI/ML n° conf. 2\r\n per Visita di controllo e di follow up\r\n\r\n</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Hospitalizations</td>
                <td>Details on hospital admissions, diagnoses, and length of stays</td>
                <td>Free text</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Divisione di pediatria</p>
                    </list-item>
                    <list-item>
                      <p>Tosse, difficolta' respiratoria e di alimentazione</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>SOAP<sup>a</sup></td>
                <td>Symptoms, objectivity, diagnosis, or prescriptions</td>
                <td>Free text (including codes)</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p><italic>SOAP</italic><sup>b</sup>: “P”,</p>
                    </list-item>
                    <list-item>
                      <p><italic>SOAP_code</italic>: “77469”,</p>
                    </list-item>
                    <list-item>
                      <p><italic>SOAP_text</italic>: “visita otorinolaringoiatrica&#60;89.7&#62;”</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Specialistic visits</td>
                <td>Visit type and its diagnosis</td>
                <td>Free text (including codes)</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p><italic>codice_visitaSP</italic>: “89.01”,</p>
                    </list-item>
                    <list-item>
                      <p><italic>visita</italic>: “ecografia anche sec. Graaf per screening”,</p>
                    </list-item>
                    <list-item>
                      <p><italic>diagnosi</italic>: “problemi della vista &#60;V41.0&#62;”</p>
                    </list-item>
                  </list>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>SOAP: symptoms, objectivity, diagnosis, or prescriptions.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>For tables with multiple fields, field names are reported in italics.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>All the tables can be linked at the individual level (ie, each row of all the tables contains the fields for reporting information on dates, the assisting pediatrician’s anonymous identifier, and the patients’ anonymous identifier, which constitutes the linking key).</p>
      </sec>
      <sec>
        <title>Case Definition</title>
        <p>The case definition comes directly from the gold standard provided, and the training set for machine learning was created using those dichotomous labels (ie, 0=noncase, that is not a varicella case; and 1=case, that is a varicella case).</p>
      </sec>
      <sec>
        <title>Training and Test Sets for Machine Learning</title>
        <p>Linking by patient ID, pediatrician ID, and reporting date, we merged the five tables into a single table consisting of several entries, each of which represents a visit or evaluation of a patient carried out by a pediatrician on a specific day. At this step, the information (excluding patient ID, pediatrician ID, and reporting date) is contained in 15 columns containing free text mixed with coded text, which was considered by us as free text as well. Finally, all remaining columns of the table were merged into a single corpus (ie, a body of text). This process was applied to train the models on 1,230,355 entries (database of the Veneto region) and to test them on 569,926 entries (database of the Sicily region) separately.</p>
      </sec>
      <sec>
        <title>Preprocessing</title>
        <p>Text analysis by a computer program is possible only after establishing a way to convert text (ie, readable to humans) into numbers (ie, readable to computers). This process is called preprocessing, and it is the first [<xref ref-type="bibr" rid="ref23">23</xref>] and probably the most important step in data mining [<xref ref-type="bibr" rid="ref24">24</xref>]. To process the corpus of Pedianet EHRs included in the training set, we used the following strategy. First, we converted all fields to a text type; lowercased the content; and cleared it of symbols, punctuation, numbers, and extra white spaces. Second, we stemmed the words (ie, reducing them to their basic form, or “root”), which is recognized as one of the most important procedures to perform [<xref ref-type="bibr" rid="ref25">25</xref>], and constructed 2-gram tokens, which has been shown to be the optimal rank for gram tokenization [<xref ref-type="bibr" rid="ref26">26</xref>]. Third, we removed all the (stemmed) stop words (ie, common and nonmeaningful words such as articles or conjunctions) from the set of tokens as well as all bigrams containing any of them. We chose this strategy after exploring different approaches described in [<xref ref-type="bibr" rid="ref27">27</xref>]. Fourth, we created the document-term matrix (DTM) as a patient-token matrix. To consider both the importance of the tokens within a patient (ie, one row of the DTM) and its discrimination power between patients’ records (ie, the rows of the DTM), we computed the TF-iDF (term frequencies–inverse document frequencies) weights. TF-iDF weights help to adjust for the presence of words that are more frequent but less meaningful [<xref ref-type="bibr" rid="ref28">28</xref>].
The TF-iDF-ij entry is equal to the product of the frequency of the j-th token in the i-th document by the logarithm of the inverse of the number of documents that contain that token (ie, the more frequently a word appears in a document, the more its weight rises for that document, and the more documents that contain the j-th token, the more the weight shrinks across all the documents) [<xref ref-type="bibr" rid="ref29">29</xref>]. In the initial DTM there were 1,871,532 tokens that appear at least once, with a nonsparse/sparse entries ratio of 18,951,304/14,262,709,388. We decided to reduce it to achieve a maximum of 99% overall sparsity. Filtering out the tokens that do not appear in at least 1% of the documents reduced the sparsity to 94% (ie, 29,096 tokens that appear at least once for a nonsparse/sparse entries ratio of 13,140,370/208,891,206). The choice of a 99% level of sparsity was a tradeoff between the need to retain as many tokens as possible and the computational effort.</p>
        <p>The corpus of Pedianet EHRs comprised in the test set went through the same text preprocessing strategy in the same order, and then the DTM was created with the initial TF weighting scheme. Furthermore, it was adapted with the same tokens retained in the training phase (ie, adding the missing tokens, weighting them as zero, and removing the ones not included in the training DTM) and was finally reweighted with the TF-iDF weighting scheme using the iDF weights of the corresponding training DTM, which were retained from its application to the whole training data set. Those are necessary steps to guarantee that the two feature spaces are the same and that the models trained can be evaluated on the test set.</p>
      </sec>
      <sec>
        <title>Machine Learning Techniques</title>
        <p>Enhancements of GLMs for carrying out text mining on EHRs have been proposed in the form of the lasso and GLMNet [<xref ref-type="bibr" rid="ref16">16</xref>], multinomial logistic regression (MAXENT), and the boosting approach (LogitBoost) [<xref ref-type="bibr" rid="ref16">16</xref>].</p>
        <p>GLMNet is a regularized regression method that linearly combines the L1 and L2 penalties of the lasso and ridge methods applied in synergy with a link function and a variance function to overcome linear model limitations (eg, the constant variability among the mean and the normality of the data). The link function selected was the binomial (ie, the model fit a regularized logistic regression model for the log odds), while the amount of regularization was automatically selected by the algorithm through an exploration of 100 values between the minimum value that reduced all the coefficients to zero and its 0.01 fraction.</p>
        <p>MAXENT is an implementation of (multinomial) logistic regression aimed at minimizing the memory load on large data sets in R (R Foundation for Statistical Computing) and is primarily designed to work with the sparse DTM provided by the R package [<xref ref-type="bibr" rid="ref30">30</xref>]. It has been proven to provide results mathematically equivalent to a GLM with a Poisson link function [<xref ref-type="bibr" rid="ref31">31</xref>].</p>
        <p>Boosting is a general approach for improving the predictive capability of any given learning algorithm. We used the adaptations of Tuszynski [<xref ref-type="bibr" rid="ref32">32</xref>] to the original algorithm (ie, LogitBoost [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]), which is aimed at making the entire process more efficient while applying it on large data sets. The standard boosting technique [<xref ref-type="bibr" rid="ref34">34</xref>] is applied to the sequential use of a decision stump classification algorithm as a weak learner (ie, a single binary decision tree). The number of stumps considered is the same as the columns provided in the training set.</p>
        <p>Those techniques are chosen among computationally treatable algorithms for use with large data sets [<xref ref-type="bibr" rid="ref30">30</xref>]. GLMNet and MAXENT represent classical benchmark approaches to linear and logistic classification, respectively, in a manner that differs from LogitBoost, which is a modern boosted tree-based machine learning approach [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. Moreover, LogitBoost generalizes the classical logistic models by fitting a logistic model at each node [<xref ref-type="bibr" rid="ref37">37</xref>] and shows an alternative point of view with regards to models such as the GLMs, for which the structure of the learner must be chosen a priori [<xref ref-type="bibr" rid="ref38">38</xref>].</p>
      </sec>
      <sec>
        <title>Training and Testing</title>
        <p>We addressed the issue of internal validation by performing cross-validation on the training set comprising records from the Veneto region. We dealt with external validation by accessing a truly external sample of Pedianet EHRs from another Italian region, Sicily. This accomplishes two tasks: preserving precision in the training phase and complementing study findings with external validation results using data that were not available when the predictive tool was developed.</p>
        <p>We used a 5-fold cross-validation approach to validate each of the three MLTs on the DTM with the corresponding (by row) “case/non-case” attached labels. All MLTs were simultaneously fitted on the same set of folds to ensure a proper comparison between techniques. Values of k=10 or k=5 (especially for large data sets) have been shown empirically to yield acceptable (in terms of bias-variance trade-off) error rates [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. Thus, the choice of 5 folds was driven by computational complexity: the fewer the folds, the lower the complexity.</p>
        <p>As measures of performance, we calculated point estimates and 95% CIs for the following.</p>
        <list list-type="bullet">
          <list-item>
            <p>PPV or Precision: <inline-graphic xlink:href="medinform_v8i5e14330_fig2.png" xlink:type="simple" mimetype="image"/>, that is the fraction of positively identified cases that are true positives</p>
          </list-item>
          <list-item>
            <p>NPV: <inline-graphic xlink:href="medinform_v8i5e14330_fig3.png" xlink:type="simple" mimetype="image"/>, that is the fraction of identified noncases that are true negatives</p>
          </list-item>
          <list-item>
            <p>Sensitivity or Recall: <inline-graphic xlink:href="medinform_v8i5e14330_fig4.png" xlink:type="simple" mimetype="image"/>, that is the true positive rate</p>
          </list-item>
          <list-item>
            <p>Specificity: <inline-graphic xlink:href="medinform_v8i5e14330_fig5.png" xlink:type="simple" mimetype="image"/>, that is the true negative rate</p>
          </list-item>
          <list-item>
            <p>F score: <inline-graphic xlink:href="medinform_v8i5e14330_fig6.png" xlink:type="simple" mimetype="image"/>, the harmonic mean of the PPV (Precision) and Sensitivity (Recall)</p>
          </list-item>
        </list>
        <p>The Gwet agreement coefficient 1 (AC1) statistics of agreement [<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>] between the techniques were computed and reported, along with their corresponding 95% CIs. Given that A=the number of times both models classify a record as noncase, D=the number of times both models classify a record as a case, and N=the total sample size, then <inline-graphic xlink:href="medinform_v8i5e14330_fig7.png" xlink:type="simple" mimetype="image"/>, where <inline-graphic xlink:href="medinform_v8i5e14330_fig8.png" xlink:type="simple" mimetype="image"/>, and <italic>e</italic><sup>γ</sup> is the agreement probability by chance and is equal to 2<italic>q</italic> (1 – <italic>q</italic>), where <inline-graphic xlink:href="medinform_v8i5e14330_fig9.png" xlink:type="simple" mimetype="image"/>; A1 is the number of records classified as noncase by model 1, and B1 is the number of records classified as noncase by model 2. AC1 has been used given its propensity to be weakly affected by marginal probability, and therefore it was chosen to manage unbalanced data [<xref ref-type="bibr" rid="ref43">43</xref>].</p>
        <p>All the analyses were implemented in the R system [<xref ref-type="bibr" rid="ref44">44</xref>] with the computing facilities of the Unit of Biostatistics, Epidemiology and Public Health. The R packages used were: <italic>SnowballC</italic> (to stem the words) and <italic>RWeka</italic> (to create n-grams) for the preprocessing step; <italic>Matrix</italic> and <italic>SparseM</italic> to manage sparse matrices; <italic>GLMNet</italic>, <italic>MAXENT,</italic> and <italic>caTools</italic> for the GLMNet, MAXENT, and LogitBoost MLT implementation; <italic>caret</italic> to create and evaluate the cross-validation folds; <italic>ROCR</italic> to estimate the performance; and the <italic>tidyverse</italic> bundle of packages for data management, functional programming, and plots. A git repository of the analysis code is available [<xref ref-type="bibr" rid="ref45">45</xref>].</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>The flow chart, from data acquisition to preprocessing, is shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>. In the training set, 29,096 initial terms out of 1,871,532 were retained by the sparsity reduction step. Boosting significantly outperformed all other MLTs on the training set, with the highest <italic>F</italic> score and PPV. The GLMNet predictor delivered a superior <italic>F</italic> score and greater PPV compared to MAXENT (<xref ref-type="table" rid="table3">Table 3</xref>). The same results held on the test set (<xref ref-type="table" rid="table4">Table 4</xref>) and agreement between MLT predictions on the training set was good as measured by AC1 statistics (<xref ref-type="table" rid="table5">Table 5</xref>).</p>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>Flowchart from the acquisition of the five tables containing the electronic health records (dark gray) in the training set that were merged into a single table (dark blue); preprocessed (gray) with the specification of what was removed (pink) prior to the creation of the document-term matrix (DTM) (yellow); the computation of the weights (light blue); the dimensionality reduction, that is the reduction of the terms used (light gray), and the final DTM used (green). DTM: document-term matrix; SOAP: symptoms, objectivity, diagnosis, or prescriptions; TF-iDF: term frequencies–inverse document frequencies.</p>
        </caption>
        <graphic xlink:href="medinform_v8i5e14330_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <table-wrap position="float" id="table3">
        <label>Table 3</label>
        <caption>
          <p>Performance on the training set of the three machine learning techniques using a 5-fold cross-validation method.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="100"/>
          <col width="200"/>
          <col width="160"/>
          <col width="160"/>
          <col width="200"/>
          <col width="180"/>
          <thead>
            <tr valign="bottom">
              <td>Technique</td>
              <td>Sensitivity, mean (95% CI)</td>
              <td>PPV<sup>a</sup>, mean (95% CI)</td>
              <td>NPV<sup>b</sup>, mean (95% CI)</td>
              <td>Specificity, mean (95% CI)</td>
              <td><italic>F</italic> score, mean (95% CI)</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>GLMNet<sup>c</sup></td>
              <td>80.2 (77.7-82.7)</td>
              <td>73.2 (70.9-75.6)</td>
              <td>90.9 (89.6-92.2)</td>
              <td>87.1 (85.6-88.7)</td>
              <td>76.5 (75.6-77.5)</td>
            </tr>
            <tr valign="top">
              <td>MAXENT<sup>d</sup></td>
              <td>68.8 (66.8-70.7)</td>
              <td>66.0 (62.5-69.5)</td>
              <td>86.1 (85.2-86.9)</td>
              <td>84.5 (82.7-86.3)</td>
              <td>67.4 (64.7-70.0)</td>
            </tr>
            <tr valign="top">
              <td>Boosting</td>
              <td>86.6 (82.1-91.1)</td>
              <td>95.8 (93.2-98.5)</td>
              <td>94.4 (92.4-96.3)</td>
              <td>98.3 (97.0-99.6)</td>
              <td>90.9 (89.7-92.1)</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table3fn1">
            <p><sup>a</sup>PPV: positive predictive value.</p>
          </fn>
          <fn id="table3fn2">
            <p><sup>b</sup>NPV: negative predictive value.</p>
          </fn>
          <fn id="table3fn3">
            <p><sup>c</sup>GLMNet: elastic-net regularized generalized linear model.</p>
          </fn>
          <fn id="table3fn4">
            <p><sup>d</sup>MAXENT: maximum entropy.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table4">
        <label>Table 4</label>
        <caption>
          <p>Performance on the test set of the three machine learning techniques under consideration.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="100"/>
          <col width="200"/>
          <col width="160"/>
          <col width="160"/>
          <col width="200"/>
          <col width="180"/>
          <thead>
            <tr valign="bottom">
              <td>Technique</td>
              <td>Sensitivity, mean (95% CI)</td>
              <td>PPV<sup>a</sup>, mean (95% CI)</td>
              <td>NPV<sup>b</sup>, mean (95% CI)</td>
              <td>Specificity, mean (95% CI)</td>
              <td><italic>F</italic> score, mean (95% CI)</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>GLMNet<sup>c</sup></td>
              <td>72.3 (66.4-78.1)</td>
              <td>24.5 (21.0-28.0)</td>
              <td>98.3 (97.9-98.6)</td>
              <td>87.4 (85.4-89.5)</td>
              <td>36.5 (32.2-40.8)</td>
            </tr>
            <tr valign="top">
              <td>MAXENT<sup>d</sup></td>
              <td>74.8 (62.2-87.5)</td>
              <td>11.0 (9.5-12.5)</td>
              <td>98.0 (97.3-98.6)</td>
              <td>65.5 (54.7-76.2)</td>
              <td>19.1 (17.2-20.9)</td>
            </tr>
            <tr valign="top">
              <td>Boosting</td>
              <td>79.2 (69.7-88.7)</td>
              <td>63.1 (42.7-83.5)</td>
              <td>98.8 (98.3-99.3)</td>
              <td>96.9 (94.2-99.6)</td>
              <td>68.5 (59.3-77.7)</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table4fn1">
            <p><sup>a</sup>PPV: positive predictive value.</p>
          </fn>
          <fn id="table4fn2">
            <p><sup>b</sup>NPV: negative predictive value.</p>
          </fn>
          <fn id="table4fn3">
            <p><sup>c</sup>GLMNet: elastic-net regularized generalized linear model.</p>
          </fn>
          <fn id="table4fn4">
            <p><sup>d</sup>MAXENT: maximum entropy.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table5">
        <label>Table 5</label>
        <caption>
          <p>Agreement between elastic-net regularized generalized linear model, maximum entropy, and boosting using 5-fold cross-validation.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="260"/>
          <col width="180"/>
          <col width="190"/>
          <col width="140"/>
          <col width="230"/>
          <thead>
            <tr valign="bottom">
              <td>Technique</td>
              <td>Wrongly agree<sup>a</sup>, n</td>
              <td>Correctly agree<sup>b</sup>, n</td>
              <td>Disagree<sup>c</sup>, n</td>
              <td>Gwet AC1<sup>d,e</sup> (95% CI)</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="bottom">
              <td>GLMNet<sup>f</sup> vs MAXENT<sup>g</sup></td>
              <td>669</td>
              <td>5609</td>
              <td>1353</td>
              <td>0.68 (0.67-0.70)</td>
            </tr>
            <tr valign="top">
              <td>GLMNet vs boosting</td>
              <td>195</td>
              <td>6269</td>
              <td>1146</td>
              <td>0.74 (0.72-0.75)</td>
            </tr>
            <tr valign="top">
              <td>MAXENT vs boosting</td>
              <td>224</td>
              <td>5895</td>
              <td>1491</td>
              <td>0.66 (0.65-0.68)</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table5fn1">
            <p><sup>a</sup>The “Wrongly Agree” column refers to the number of records misclassified by both techniques.</p>
          </fn>
          <fn id="table5fn2">
            <p><sup>b</sup>The “Correctly Agree” column states the number of records correctly classified by both techniques.</p>
          </fn>
          <fn id="table5fn3">
            <p><sup>c</sup>The “Disagree” column lists the number of records for which the techniques disagree in the classification.</p>
          </fn>
          <fn id="table5fn4">
            <p><sup>d</sup>AC1: agreement coefficient 1.</p>
          </fn>
          <fn id="table5fn5">
            <p><sup>e</sup>Gwet AC1 represents the index of agreement between the identified techniques. Legend for AC1 is: AC1&#60;0=disagreement; AC1 0.00-0.40=poor; AC1 0.41-0.60=discrete; AC1 0.61-0.80=good; AC1 0.81-1.00=optimal.</p>
          </fn>
          <fn id="table5fn6">
            <p><sup>f</sup>GLMNet: elastic-net regularized generalized linear model.</p>
          </fn>
          <fn id="table5fn7">
            <p><sup>g</sup>MAXENT: maximum entropy.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <p>With the aim to analyze the most relevant errors, we explored if any records were wrongly classified by all the techniques. There were 3 records: 1 wrongly classified as positive and 2 wrongly classified as negatives by all the MLTs.</p>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>The application of MLTs to EHRs constitutes the analytical component of an emerging research paradigm that rests on the capture and preprocessing of massive amounts of clinical data to gain clinical insights and ideally to complement the decision-making process at different levels, from individual treatment to definition of national public health policies. As acknowledged by others [<xref ref-type="bibr" rid="ref46">46</xref>], the development and application of big data analysis methods on EHRs may help create a continually learning health care system [<xref ref-type="bibr" rid="ref47">47</xref>].</p>
        <p>This study trains and compares three different machine learning approaches towards infectious disease detection at the population level based on clinical data collected in primary care EHRs. In line with the recommended paradigm for model validation [<xref ref-type="bibr" rid="ref39">39</xref>], the MLTs’ performance underwent internal validation through cross-validation and external validation on an independent set of EHRs.</p>
        <p>The predictive capabilities of the developed MLTs are promising even if quite different from each other (eg, validation <italic>F</italic> scores range from 67%-91% and test <italic>F</italic> scores range from 19%-69%). Findings on the better performance reached by LogitBoost are in line with recent evidence that shows an improvement in general classification problems moving from MAXENT algorithms to LogitBoost-based ones [<xref ref-type="bibr" rid="ref48">48</xref>]. LogitBoost is thus confirmed to be a useful technique for solving health-related classification problems [<xref ref-type="bibr" rid="ref34">34</xref>].</p>
        <p>Only three records were wrongly classified by all the models. The first one was wrongly classified as positive probably because the text entry was “vaccini:varicella e mpr” (ie, vaccine: varicella and mpr), and after the preprocessing, the bigram “vaccin varicell” was removed because the TF-iDF weight was low. Thus, the relationship between varicella and vaccine was lost, and only the token “varicell” remained.</p>
        <p>The other two records were wrongly classified as negative. For one of them, the misclassification was probably due to an issue in the tokenization. In fact, an anomalous sequence of dashes (“-”) and blanks led to the token “- varicella”, which was removed from the feature space, leaving no reference to the disease. The second negative misclassified record referred to a child who was vaccinated for measles, mumps, rubella, and varicella (quadrivalent vaccine). The pediatrician wrote “vaccinazione morbillo parotite rosolia varicella” (ie, vaccination, measles, mumps, rubella, varicella). The bigram “rosol varicell” (ie, “rubell varicell”) was weighted 0.361 and, hence, was retained in the feature space, and was considered by all the MLTs a pattern of noninfection.</p>
        <p>The strength of tree-based models such as LogitBoost also lies in their high scalability. In fact, their computational complexity (ie, the asymptotic time needed for a complete run) grows linearly with the sample size and quadratically with the number of features used (ie, the number of tokens considered) [<xref ref-type="bibr" rid="ref37">37</xref>]. Assuming that the richness of the pediatric EHRs’ vocabulary is limited (ie, the number of tokens reaches a plateau as data accumulates over time), an increase in computational time will only depend linearly on the number of patients.</p>
        <p>Any attempt to use EHRs to identify patients with a specific disease would depend on the algorithm, the database, the language, and the true prevalence of the disease. As to the generalization of these models to other contexts, we hypothesize that they could also be successfully applied in public health systems with EHR charting in other languages [<xref ref-type="bibr" rid="ref49">49</xref>].</p>
        <p>We acknowledge that one metric (ie, sensitivity, specificity, PPV, or NPV) may be more important than another, depending on the intended use of the classification algorithm. Thus, the LogitBoost model is adequate for ascertaining varicella cases, with a preference for case identification with good sensitivity and excellent specificity.</p>
        <p>If the aim of using MLTs is to help create a gold standard for databases, the limited agreement between the MLTs reported in <xref ref-type="table" rid="table5">Table 5</xref> suggests that these classification algorithms are not reliable as a set of annotators.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Some limitations must be acknowledged. First, it is acknowledged that text preprocessing is a crucial step. The way to convert free text into numbers and numbers into features is an essential step of the process and has one of the biggest impacts on the results [<xref ref-type="bibr" rid="ref24">24</xref>]. For the same reason as before, we decided to follow a standard preprocessing procedure without searching for the best one to obtain results that are, as far as possible, independent of human tuning.</p>
        <p>Second, we set the number of boosting iterations as the same number of features considered. This is suboptimal in computational time because the same performance can be reached with fewer iterations [<xref ref-type="bibr" rid="ref37">37</xref>]. Nevertheless, we aimed to reach an upper-bound value for the performance estimated in an optimal situation.</p>
        <p>Third, the large difference in disease prevalence between the training and the validation data set should be noted. The boosting approach seems to deal with this issue in a satisfactory way, but a potential impact on model prediction could not be excluded.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>Given their promising performance in identifying varicella cases, LogitBoost, and MLTs in general, could be effectively used for large-scale surveillance, minimizing time and cost in a scalable and reproducible manner.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AC1</term>
          <def>
            <p>agreement coefficient 1</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">DTM</term>
          <def>
            <p>document-term matrix</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">GLM</term>
          <def>
            <p>generalized linear model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">GLMNet</term>
          <def>
            <p>elastic-net regularized generalized linear model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">MAXENT</term>
          <def>
            <p>maximum entropy</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">MLT</term>
          <def>
            <p>machine learning technique</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">NPV</term>
          <def>
            <p>negative predictive value</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">PPV</term>
          <def>
            <p>positive predictive value</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">TF-iDF</term>
          <def>
            <p>term frequencies–inverse document frequencies</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The data that support the findings of this study are available from Pedianet, but restrictions apply to the availability of these data, which were used under license for this study and are not publicly available. Data are, however, available from the authors upon reasonable request and with the permission of Pedianet.</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>CL, CG, and DG designed the study. CL and PB performed the analysis. CL, PB, IB, and GL wrote the manuscript. IB and DG interpreted the statistical results. GL and CG interpreted the clinical results. LT, AS, and LG handled data management.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Magill</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Dumyati</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ray</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Fridkin</surname>
              <given-names>SK</given-names>
            </name>
          </person-group>
          <article-title>Evaluating epidemiology and improving surveillance of infections associated with health care, United States</article-title>
          <source>Emerg Infect Dis</source>
          <year>2015</year>
          <month>09</month>
          <volume>21</volume>
          <issue>9</issue>
          <fpage>1537</fpage>
          <lpage>42</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.doi.org/10.3201/eid2109.150508"/>
          </comment>
          <pub-id pub-id-type="doi">10.3201/eid2109.150508</pub-id>
          <pub-id pub-id-type="medline">26291035</pub-id>
          <pub-id pub-id-type="pmcid">PMC4550137</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lloyd-Smith</surname>
              <given-names>JO</given-names>
            </name>
            <name name-style="western">
              <surname>Funk</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>McLean</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Riley</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wood</surname>
              <given-names>JL</given-names>
            </name>
          </person-group>
          <article-title>Nine challenges in modelling the emergence of novel pathogens</article-title>
          <source>Epidemics</source>
          <year>2015</year>
          <month>03</month>
          <volume>10</volume>
          <fpage>35</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1755-4365(14)00050-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.epidem.2014.09.002</pub-id>
          <pub-id pub-id-type="medline">25843380</pub-id>
          <pub-id pub-id-type="pii">S1755-4365(14)00050-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC4715032</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sutherland</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Kaelber</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Downing</surname>
              <given-names>NL</given-names>
            </name>
            <name name-style="western">
              <surname>Goel</surname>
              <given-names>VV</given-names>
            </name>
            <name name-style="western">
              <surname>Longhurst</surname>
              <given-names>CA</given-names>
            </name>
          </person-group>
          <article-title>Electronic health record-enabled research in children using the electronic health record for clinical discovery</article-title>
          <source>Pediatr Clin North Am</source>
          <year>2016</year>
          <month>04</month>
          <volume>63</volume>
          <issue>2</issue>
          <fpage>251</fpage>
          <lpage>68</lpage>
          <pub-id pub-id-type="doi">10.1016/j.pcl.2015.12.002</pub-id>
          <pub-id pub-id-type="medline">27017033</pub-id>
          <pub-id pub-id-type="pii">S0031-3955(15)00203-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Baracco</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Eisert</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Saavedra</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hirsch</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Marin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ortega-Sanchez</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Clinical and economic impact of various strategies for varicella immunity screening and vaccination of health care personnel</article-title>
          <source>Am J Infect Control</source>
          <year>2015</year>
          <month>10</month>
          <day>01</day>
          <volume>43</volume>
          <issue>10</issue>
          <fpage>1053</fpage>
          <lpage>60</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ajic.2015.05.027</pub-id>
          <pub-id pub-id-type="medline">26138999</pub-id>
          <pub-id pub-id-type="pii">S0196-6553(15)00616-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Damm</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Ultsch</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Horn</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mikolajczyk</surname>
              <given-names>RT</given-names>
            </name>
            <name name-style="western">
              <surname>Greiner</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Wichmann</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Systematic review of models assessing the economic value of routine varicella and herpes zoster vaccination in high-income countries</article-title>
          <source>BMC Public Health</source>
          <year>2015</year>
          <month>06</month>
          <day>05</day>
          <volume>15</volume>
          <fpage>533</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcpublichealth.biomedcentral.com/articles/10.1186/s12889-015-1861-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12889-015-1861-8</pub-id>
          <pub-id pub-id-type="medline">26041469</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12889-015-1861-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC4455277</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kawai</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gebremeskel</surname>
              <given-names>BG</given-names>
            </name>
            <name name-style="western">
              <surname>Acosta</surname>
              <given-names>CJ</given-names>
            </name>
          </person-group>
          <article-title>Systematic review of incidence and complications of herpes zoster: towards a global perspective</article-title>
          <source>BMJ Open</source>
          <year>2014</year>
          <month>06</month>
          <day>10</day>
          <volume>4</volume>
          <issue>6</issue>
          <fpage>e004833</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://bmjopen.bmj.com/cgi/pmidlookup?view=long&#38;pmid=24916088"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmjopen-2014-004833</pub-id>
          <pub-id pub-id-type="medline">24916088</pub-id>
          <pub-id pub-id-type="pii">bmjopen-2014-004833</pub-id>
          <pub-id pub-id-type="pmcid">PMC4067812</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pierik</surname>
              <given-names>JG</given-names>
            </name>
            <name name-style="western">
              <surname>Gumbs</surname>
              <given-names>PD</given-names>
            </name>
            <name name-style="western">
              <surname>Fortanier</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Van Steenwijk</surname>
              <given-names>PC</given-names>
            </name>
            <name name-style="western">
              <surname>Postma</surname>
              <given-names>MJ</given-names>
            </name>
          </person-group>
          <article-title>Epidemiological characteristics and societal burden of varicella zoster virus in the Netherlands</article-title>
          <source>BMC Infect Dis</source>
          <year>2012</year>
          <month>05</month>
          <day>10</day>
          <volume>12</volume>
          <fpage>110</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcinfectdis.biomedcentral.com/articles/10.1186/1471-2334-12-110"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2334-12-110</pub-id>
          <pub-id pub-id-type="medline">22574722</pub-id>
          <pub-id pub-id-type="pii">1471-2334-12-110</pub-id>
          <pub-id pub-id-type="pmcid">PMC3464966</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jensen</surname>
              <given-names>PB</given-names>
            </name>
            <name name-style="western">
              <surname>Jensen</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Brunak</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Mining electronic health records: towards better research applications and clinical care</article-title>
          <source>Nat Rev Genet</source>
          <year>2012</year>
          <month>05</month>
          <day>02</day>
          <volume>13</volume>
          <issue>6</issue>
          <fpage>395</fpage>
          <lpage>405</lpage>
          <pub-id pub-id-type="doi">10.1038/nrg3208</pub-id>
          <pub-id pub-id-type="medline">22549152</pub-id>
          <pub-id pub-id-type="pii">nrg3208</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Afzal</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Schuemie</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>van Blijderveen</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Sen</surname>
              <given-names>EF</given-names>
            </name>
            <name name-style="western">
              <surname>Sturkenboom</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>Kors</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>Improving sensitivity of machine learning methods for automated case identification from free-text electronic medical records</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2013</year>
          <month>03</month>
          <day>02</day>
          <volume>13</volume>
          <fpage>30</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/1472-6947-13-30"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1472-6947-13-30</pub-id>
          <pub-id pub-id-type="medline">23452306</pub-id>
          <pub-id pub-id-type="pii">1472-6947-13-30</pub-id>
          <pub-id pub-id-type="pmcid">PMC3602667</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>AD</given-names>
            </name>
            <name name-style="western">
              <surname>Tate</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Denaxas</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shawe-Taylor</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hemingway</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Extracting diagnoses and investigation results from unstructured text in electronic health records by semi-supervised machine learning</article-title>
          <source>PLoS One</source>
          <year>2012</year>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>e30412</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0030412"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0030412</pub-id>
          <pub-id pub-id-type="medline">22276193</pub-id>
          <pub-id pub-id-type="pii">PONE-D-11-16913</pub-id>
          <pub-id pub-id-type="pmcid">PMC3261909</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kavuluru</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rios</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>An empirical evaluation of supervised learning approaches in assigning diagnosis codes to electronic medical records</article-title>
          <source>Artif Intell Med</source>
          <year>2015</year>
          <month>10</month>
          <volume>65</volume>
          <issue>2</issue>
          <fpage>155</fpage>
          <lpage>166</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26054428"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.artmed.2015.04.007</pub-id>
          <pub-id pub-id-type="medline">26054428</pub-id>
          <pub-id pub-id-type="pii">S0933-3657(15)00048-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC4605853</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ford</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Carroll</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>HE</given-names>
            </name>
            <name name-style="western">
              <surname>Scott</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Cassell</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>Extracting information from the text of electronic medical records to improve case detection: a systematic review</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2016</year>
          <month>09</month>
          <volume>23</volume>
          <issue>5</issue>
          <fpage>1007</fpage>
          <lpage>1015</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26911811"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocv180</pub-id>
          <pub-id pub-id-type="medline">26911811</pub-id>
          <pub-id pub-id-type="pii">ocv180</pub-id>
          <pub-id pub-id-type="pmcid">PMC4997034</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>You</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>A machine learning-based framework to identify type 2 diabetes through electronic health records</article-title>
          <source>Int J Med Inform</source>
          <year>2017</year>
          <month>01</month>
          <volume>97</volume>
          <fpage>120</fpage>
          <lpage>127</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27919371"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2016.09.014</pub-id>
          <pub-id pub-id-type="medline">27919371</pub-id>
          <pub-id pub-id-type="pii">S1386-5056(16)30215-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC5144921</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>P-Y</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>C-W</given-names>
            </name>
            <name name-style="western">
              <surname>Kaddi</surname>
              <given-names>CD</given-names>
            </name>
            <name name-style="western">
              <surname>Venugopalan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hoffman</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>MD</given-names>
            </name>
          </person-group>
          <article-title>-Omic and electronic health record big data analytics for precision medicine</article-title>
          <source>IEEE Trans Biomed Eng</source>
          <year>2017</year>
          <month>02</month>
          <volume>64</volume>
          <issue>2</issue>
          <fpage>263</fpage>
          <lpage>273</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27740470"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/TBME.2016.2573285</pub-id>
          <pub-id pub-id-type="medline">27740470</pub-id>
          <pub-id pub-id-type="pmcid">PMC5859562</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Friedman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hastie</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Tibshirani</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Regularization paths for generalized linear models via coordinate descent</article-title>
          <source>J Stat Softw</source>
          <year>2010</year>
          <volume>33</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>22</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/20808728"/>
          </comment>
          <pub-id pub-id-type="medline">20808728</pub-id>
          <pub-id pub-id-type="pmcid">PMC2929880</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Friedman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hastie</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Tibshirani</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Additive logistic regression: a statistical view of boosting (With discussion and a rejoinder by the authors)</article-title>
          <source>Ann Statist</source>
          <year>2000</year>
          <month>04</month>
          <volume>28</volume>
          <issue>2</issue>
          <fpage>337</fpage>
          <lpage>407</lpage>
          <pub-id pub-id-type="doi">10.1214/aos/1016218223</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mani</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Arlinghaus</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chakravarthy</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bhave</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Welch</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Levy</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yankeelov</surname>
              <given-names>TE</given-names>
            </name>
          </person-group>
          <article-title>Early prediction of the response of breast tumors to neoadjuvant chemotherapy using quantitative MRI and machine learning</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2011</year>
          <volume>2011</volume>
          <fpage>868</fpage>
          <lpage>877</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/22195145"/>
          </comment>
          <pub-id pub-id-type="medline">22195145</pub-id>
          <pub-id pub-id-type="pmcid">PMC3243164</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="web">
          <source>Pedianet</source>
          <access-date>2019-04-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.pedianet.it/en">http://www.pedianet.it/en</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nicolosi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sturkenboom</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mannino</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Arpinelli</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Cantarutti</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Giaquinto</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>The incidence of varicella: correction of a common error</article-title>
          <source>Epidemiology</source>
          <year>2003</year>
          <month>01</month>
          <volume>14</volume>
          <issue>1</issue>
          <fpage>99</fpage>
          <lpage>102</lpage>
          <pub-id pub-id-type="doi">10.1097/00001648-200301000-00024</pub-id>
          <pub-id pub-id-type="medline">12500056</pub-id>
          <pub-id pub-id-type="pii">00001648-200301000-00024</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nicolosi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sturkenboom</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mannino</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Arpinelli</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Cantarutti</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Giaquinto</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>The incidence of varicella: correction of a common error</article-title>
          <source>Epidemiology</source>
          <year>2003</year>
          <month>01</month>
          <volume>14</volume>
          <issue>1</issue>
          <fpage>99</fpage>
          <lpage>102</lpage>
          <pub-id pub-id-type="doi">10.1097/00001648-200301000-00024</pub-id>
          <pub-id pub-id-type="medline">12500056</pub-id>
          <pub-id pub-id-type="pii">00001648-200301000-00024</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cantarutti</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Donà</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Visentin</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Borgia</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Scamarcia</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cantarutti</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Peruzzi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Egan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Villa</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Giaquinto</surname>
              <given-names>C</given-names>
            </name>
            <collab>Pedianet</collab>
          </person-group>
          <article-title>Epidemiology of frequently occurring skin diseases in Italian children from 2006 to 2012: a retrospective, population-based study</article-title>
          <source>Pediatr Dermatol</source>
          <year>2015</year>
          <volume>32</volume>
          <issue>5</issue>
          <fpage>668</fpage>
          <lpage>678</lpage>
          <pub-id pub-id-type="doi">10.1111/pde.12568</pub-id>
          <pub-id pub-id-type="medline">25879514</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Donà</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mozzo</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Scamarcia</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Picelli</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Villa</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cantarutti</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Giaquinto</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Community-acquired rotavirus gastroenteritis compared with adenovirus and norovirus gastroenteritis in Italian children: a Pedianet study</article-title>
          <source>Int J Pediatr</source>
          <year>2016</year>
          <volume>2016</volume>
          <fpage>5236243</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1155/2016/5236243"/>
          </comment>
          <pub-id pub-id-type="doi">10.1155/2016/5236243</pub-id>
          <pub-id pub-id-type="medline">26884770</pub-id>
          <pub-id pub-id-type="pmcid">PMC4738938</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sebastiani</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Machine learning in automated text categorization</article-title>
          <source>ACM Comput Surv</source>
          <year>2002</year>
          <month>03</month>
          <volume>34</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>47</lpage>
          <pub-id pub-id-type="doi">10.1145/505282.505283</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Denny</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Spirling</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Text preprocessing for unsupervised learning: why it matters, when it misleads, and what to do about it</article-title>
          <source>Polit Anal</source>
          <year>2018</year>
          <month>03</month>
          <day>19</day>
          <volume>26</volume>
          <issue>2</issue>
          <fpage>168</fpage>
          <lpage>189</lpage>
          <pub-id pub-id-type="doi">10.1017/pan.2017.44</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Role of text mining in early identification of potential drug safety issues</article-title>
          <source>Methods Mol Biol</source>
          <year>2014</year>
          <volume>1159</volume>
          <fpage>227</fpage>
          <lpage>251</lpage>
          <pub-id pub-id-type="doi">10.1007/978-1-4939-0709-0_13</pub-id>
          <pub-id pub-id-type="medline">24788270</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Marafino</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Davies</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bardach</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dudley</surname>
              <given-names>RA</given-names>
            </name>
          </person-group>
          <article-title>N-gram support vector machines for scalable procedure and diagnosis classification, with applications to clinical free text data from the intensive care unit</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2014</year>
          <volume>21</volume>
          <issue>5</issue>
          <fpage>871</fpage>
          <lpage>875</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24786209"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2014-002694</pub-id>
          <pub-id pub-id-type="medline">24786209</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2014-002694</pub-id>
          <pub-id pub-id-type="pmcid">PMC4147615</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gregori</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Berchialla</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Soriani</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Baldi</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Lanera</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Maximizing text mining performance: the impact of pre-processing</article-title>
          <source>JSM Proceedings, Section on Statistical Learning and Data Science</source>
          <year>2016</year>
          <conf-name>ASA Joint Statistical Meeting</conf-name>
          <conf-date>2016</conf-date>
          <conf-loc>Chicago, IL</conf-loc>
          <fpage>3265</fpage>
          <lpage>3270</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>HC</given-names>
            </name>
            <name name-style="western">
              <surname>Luk</surname>
              <given-names>RWP</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>KF</given-names>
            </name>
            <name name-style="western">
              <surname>Kwok</surname>
              <given-names>KL</given-names>
            </name>
          </person-group>
          <article-title>Interpreting TF-IDF term weights as making relevance decisions</article-title>
          <source>ACM Trans Inf Syst</source>
          <year>2008</year>
          <month>06</month>
          <day>01</day>
          <volume>26</volume>
          <issue>3</issue>
          <fpage>1</fpage>
          <lpage>37</lpage>
          <pub-id pub-id-type="doi">10.1145/1361684.1361686</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goodall</surname>
              <given-names>CR</given-names>
            </name>
          </person-group>
          <article-title>Data mining of massive datasets in healthcare</article-title>
          <source>Journal of Computational and Graphical Statistics</source>
          <year>1999</year>
          <month>09</month>
          <volume>8</volume>
          <issue>3</issue>
          <fpage>620</fpage>
          <lpage>634</lpage>
          <pub-id pub-id-type="doi">10.1080/10618600.1999.10474837</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jurka</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>maxent: an R package for low-memory multinomial logistic regression with support for semi-automated text classification</article-title>
          <source>The R Journal</source>
          <year>2012</year>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>56</fpage>
          <pub-id pub-id-type="doi">10.32614/rj-2012-007</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Renner</surname>
              <given-names>IW</given-names>
            </name>
            <name name-style="western">
              <surname>Warton</surname>
              <given-names>DI</given-names>
            </name>
          </person-group>
          <article-title>Equivalence of MAXENT and Poisson point process models for species distribution modeling in ecology</article-title>
          <source>Biometrics</source>
          <year>2013</year>
          <month>03</month>
          <volume>69</volume>
          <issue>1</issue>
          <fpage>274</fpage>
          <lpage>81</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1541-0420.2012.01824.x</pub-id>
          <pub-id pub-id-type="medline">23379623</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tuszynski</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <source>R-project</source>
          <year>2019</year>
          <comment>caTools: Tools: Moving Window Statistics, GIF, Base64, ROC AUC, etc<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cran.r-project.org/package=caTools">https://cran.r-project.org/package=caTools</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dettling</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bühlmann</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Boosting for tumor classification with gene expression data</article-title>
          <source>Bioinformatics</source>
          <year>2003</year>
          <month>06</month>
          <day>12</day>
          <volume>19</volume>
          <issue>9</issue>
          <fpage>1061</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btf867</pub-id>
          <pub-id pub-id-type="medline">12801866</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Freund</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Schapire</surname>
              <given-names>RE</given-names>
            </name>
          </person-group>
          <article-title>Experiments with a new boosting algorithm</article-title>
          <year>1996</year>
          <month>07</month>
          <conf-name>Thirteenth International Conference on Machine Learning</conf-name>
          <conf-date>1996</conf-date>
          <conf-loc>Bari, Italy</conf-loc>
          <publisher-loc>340 Pine Street, Sixth Floor, San Francisco, CA</publisher-loc>
          <publisher-name>Morgan Kaufmann Publishers Inc</publisher-name>
          <fpage>148</fpage>
          <lpage>156</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cseweb.ucsd.edu/~yfreund/papers/boostingexperiments.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Boughorbel</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Al-Ali</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Elkum</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Model comparison for breast cancer prognosis based on clinical data</article-title>
          <source>PLoS One</source>
          <year>2016</year>
          <volume>11</volume>
          <issue>1</issue>
          <fpage>e0146413</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0146413"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0146413</pub-id>
          <pub-id pub-id-type="medline">26771838</pub-id>
          <pub-id pub-id-type="pii">PONE-D-15-41870</pub-id>
          <pub-id pub-id-type="pmcid">PMC4714871</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Andrews</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sleeman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Statham</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>McQuatt</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Corruble</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Howells</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Macmillan</surname>
              <given-names>CSA</given-names>
            </name>
          </person-group>
          <article-title>Predicting recovery in patients suffering from traumatic brain injury by using admission variables and physiological data: a comparison between decision tree analysis and logistic regression</article-title>
          <source>J Neurosurg</source>
          <year>2002</year>
          <month>08</month>
          <volume>97</volume>
          <issue>2</issue>
          <fpage>326</fpage>
          <lpage>36</lpage>
          <pub-id pub-id-type="doi">10.3171/jns.2002.97.2.0326</pub-id>
          <pub-id pub-id-type="medline">12186460</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Landwehr</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hall</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Frank</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Logistic model trees</article-title>
          <source>Mach Learn</source>
          <year>2005</year>
          <month>05</month>
          <volume>59</volume>
          <issue>1-2</issue>
          <fpage>161</fpage>
          <lpage>205</lpage>
          <pub-id pub-id-type="doi">10.1007/s10994-005-0466-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abeare</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <source>LSU Master's Theses</source>
          <year>2009</year>
          <access-date>2020-04-01</access-date>
          <comment>Comparisons of boosted regression tree, GLM and GAM performance in the standardization of yellowfin tuna catch-rate data from the Gulf of Mexico lonline [sic] fishery<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://digitalcommons.lsu.edu/gradschool_theses/2880/">https://digitalcommons.lsu.edu/gradschool_theses/2880/</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hastie</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Tibshirani</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Friedman</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <source>The Elements of Statistical Learning</source>
          <year>2009</year>
          <publisher-loc>Berlin, Germany</publisher-loc>
          <publisher-name>Springer Science &#38; Business Media</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Borra</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Di Ciaccio</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Measuring the prediction error. A comparison of cross-validation, bootstrap and covariance penalty methods</article-title>
          <source>Computational Statistics &#38; Data Analysis</source>
          <year>2010</year>
          <month>12</month>
          <volume>54</volume>
          <issue>12</issue>
          <fpage>2976</fpage>
          <lpage>2989</lpage>
          <pub-id pub-id-type="doi">10.1016/j.csda.2010.03.004</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gwet</surname>
              <given-names>KL</given-names>
            </name>
          </person-group>
          <source>Handbook of Inter-Rater Reliability: The Definitive Guide to Measuring the Extent of Agreement Among Raters</source>
          <year>2014</year>
          <publisher-loc>Piedmont, CA</publisher-loc>
          <publisher-name>Advanced Analytics, LLC</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wongpakaran</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Wongpakaran</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wedding</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Gwet</surname>
              <given-names>KL</given-names>
            </name>
          </person-group>
          <article-title>A comparison of Cohen's Kappa and Gwet's AC1 when calculating inter-rater reliability coefficients: a study conducted with personality disorder samples</article-title>
          <source>BMC Med Res Methodol</source>
          <year>2013</year>
          <month>04</month>
          <day>29</day>
          <volume>13</volume>
          <fpage>61</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/1471-2288-13-61"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2288-13-61</pub-id>
          <pub-id pub-id-type="medline">23627889</pub-id>
          <pub-id pub-id-type="pii">1471-2288-13-61</pub-id>
          <pub-id pub-id-type="pmcid">PMC3643869</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zec</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Soriani</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Comoretto</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Baldi</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>High agreement and high prevalence: the paradox of Cohen's kappa</article-title>
          <source>Open Nurs J</source>
          <year>2017</year>
          <volume>11</volume>
          <fpage>211</fpage>
          <lpage>218</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/29238424"/>
          </comment>
          <pub-id pub-id-type="doi">10.2174/1874434601711010211</pub-id>
          <pub-id pub-id-type="medline">29238424</pub-id>
          <pub-id pub-id-type="pii">TONURSJ-11-211</pub-id>
          <pub-id pub-id-type="pmcid">PMC5712640</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="web">
          <source>R Foundation for Statistical Computing</source>
          <year>2016</year>
          <access-date>2020-04-01</access-date>
          <comment>R: A Language and Environment for Statistical Computing<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.r-project.org/">https://www.r-project.org/</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="web">
          <source>GitHub</source>
          <comment>mltzostercode<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/UBESP-DCTV/mltzostercode">https://github.com/UBESP-DCTV/mltzostercode</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ross</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Ohno-Machado</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>"Big data" and the electronic health record</article-title>
          <source>Yearb Med Inform</source>
          <year>2014</year>
          <month>08</month>
          <day>15</day>
          <volume>9</volume>
          <fpage>97</fpage>
          <lpage>104</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.thieme-connect.com/DOI/DOI?10.15265/IY-2014-0003"/>
          </comment>
          <pub-id pub-id-type="doi">10.15265/IY-2014-0003</pub-id>
          <pub-id pub-id-type="medline">25123728</pub-id>
          <pub-id pub-id-type="pii">me2014-0003</pub-id>
          <pub-id pub-id-type="pmcid">PMC4287068</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wiens</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shenoy</surname>
              <given-names>ES</given-names>
            </name>
          </person-group>
          <article-title>Machine learning for healthcare: on the verge of a major shift in healthcare epidemiology</article-title>
          <source>Clin Infect Dis</source>
          <year>2018</year>
          <month>01</month>
          <day>06</day>
          <volume>66</volume>
          <issue>1</issue>
          <fpage>149</fpage>
          <lpage>153</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/29020316"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/cid/cix731</pub-id>
          <pub-id pub-id-type="medline">29020316</pub-id>
          <pub-id pub-id-type="pii">4085880</pub-id>
          <pub-id pub-id-type="pmcid">PMC5850539</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xing</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Geng</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Xue</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Logistic boosting regression for label distribution learning</article-title>
          <year>2016</year>
          <conf-name>2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>
          <conf-date>2016</conf-date>
          <conf-loc>Las Vegas, NV</conf-loc>
          <fpage>4489</fpage>
          <lpage>4497</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://openaccess.thecvf.com/content_cvpr_2016/papers/Xing_Logistic_Boosting_Regression_CVPR_2016_paper.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/CVPR.2016.486</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lorenzoni</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Bressan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lanera</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Azzolina</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Da Dalt</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gregori</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Analysis of unstructured text-based data using machine learning techniques: the case of pediatric emergency department records in Nicaragua</article-title>
          <source>Med Care Res Rev</source>
          <year>2019</year>
          <month>04</month>
          <day>29</day>
          <conf-name>APHA 2017 Annual Meeting &#38; Expo</conf-name>
          <conf-date>November 4-8, 2017</conf-date>
          <conf-loc>Atlanta, GA</conf-loc>
          <fpage>1077558719844123</fpage>
          <pub-id pub-id-type="doi">10.1177/1077558719844123</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
