<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v7i4e14340</article-id>
      <article-id pub-id-type="pmid">31702562</article-id>
      <article-id pub-id-type="doi">10.2196/14340</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Automatic Detection of Hypoglycemic Events From the Electronic Health Record Notes of Diabetes Patients: Empirical Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Musy</surname>
            <given-names>Sarah</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Torii</surname>
            <given-names>Manabu</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Jin</surname>
            <given-names>Yonghao</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2346-8768</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Fei</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1816-1761</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Vimalananda</surname>
            <given-names>Varsha G</given-names>
          </name>
          <degrees>MPH, MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0738-672X</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Yu</surname>
            <given-names>Hong</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Computer Science</institution>
            <institution>University of Massachusetts Lowell</institution>
            <addr-line>220 Pawtucket St</addr-line>
            <addr-line>Lowell, MA, 01854</addr-line>
            <country>United States</country>
            <phone>1 9789343620</phone>
            <email>Hong_Yu@uml.edu</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9263-5035</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Computer Science</institution>
        <institution>University of Massachusetts Lowell</institution>
        <addr-line>Lowell, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Center for Healthcare Organization and Implementation Research</institution>
        <addr-line>Bedford, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Section of Endocrinology, Diabetes and Metabolism</institution>
        <institution>School of Medicine</institution>
        <institution>Boston University</institution>
        <addr-line>Boston, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Medicine</institution>
        <institution>University of Massachusetts Medical School</institution>
        <addr-line>Worcester, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Computer Science</institution>
        <institution>University of Massachusetts Amherst</institution>
        <addr-line>Amherst, MA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Hong Yu <email>Hong_Yu@uml.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <season>Oct-Dec</season>
        <year>2019</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>8</day>
        <month>11</month>
        <year>2019</year>
      </pub-date>
      <volume>7</volume>
      <issue>4</issue>
      <elocation-id>e14340</elocation-id>
      <history>
        <date date-type="received">
          <day>12</day>
          <month>4</month>
          <year>2019</year>
        </date>
        <date date-type="rev-request">
          <day>14</day>
          <month>5</month>
          <year>2019</year>
        </date>
        <date date-type="rev-recd">
          <day>8</day>
          <month>7</month>
          <year>2019</year>
        </date>
        <date date-type="accepted">
          <day>19</day>
          <month>10</month>
          <year>2019</year>
        </date>
      </history>
      <copyright-statement>©Yonghao Jin, Fei Li, Varsha G Vimalananda, Hong Yu. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 08.11.2019.</copyright-statement>
      <copyright-year>2019</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://medinform.jmir.org/2019/4/e14340/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Hypoglycemic events are common and potentially dangerous conditions among patients being treated for diabetes. Automatic detection of such events could improve patient care and is valuable in population studies. Electronic health records (EHRs) are valuable resources for the detection of such events.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>In this study, we aim to develop a deep-learning–based natural language processing (NLP) system to automatically detect hypoglycemic events from EHR notes. Our model is called the High-Performing System for Automatically Detecting Hypoglycemic Events (HYPE).</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Domain experts reviewed 500 EHR notes of diabetes patients to determine whether each sentence contained a hypoglycemic event or not. We used this annotated corpus to train and evaluate HYPE, the high-performance NLP system for hypoglycemia detection. We built and evaluated both a classical machine learning model (ie, support vector machines [SVMs]) and state-of-the-art neural network models.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>We found that neural network models outperformed the SVM model. The convolutional neural network (CNN) model yielded the highest performance in a 10-fold cross-validation setting: mean precision=0.96 (SD 0.03), mean recall=0.86 (SD 0.03), and mean F1=0.91 (SD 0.03).</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Despite the challenges posed by small and highly imbalanced data, our CNN-based HYPE system still achieved a high performance for hypoglycemia detection. HYPE can be used for EHR-based hypoglycemia surveillance and population studies in diabetes patients.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>convolutional neural networks</kwd>
        <kwd>hypoglycemia</kwd>
        <kwd>adverse events</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>An estimated 29.1 million Americans aged 20 years or older have diabetes mellitus [<xref ref-type="bibr" rid="ref1">1</xref>]. Current standards of care call for stringent glycemic control to prevent the complications of diabetes. Intensive drug therapy, particularly in older adults, increases the frequency of hypoglycemia, defined as blood glucose less than 70 mg/dL [<xref ref-type="bibr" rid="ref2">2</xref>]. Treatment-associated hypoglycemia is the third-most common adverse drug event in patients with diabetes mellitus. Severe hypoglycemia, requiring third-party help or with blood glucose below 54 mg/dL, is associated with seizures, coma, and death and results in about 25,000 emergency department visits and 11,000 hospitalizations annually among Medicare patients in the United States [<xref ref-type="bibr" rid="ref3">3</xref>]. In addition, mild hypoglycemia causes troublesome symptoms, such as anxiety, palpitations, and confusion, and is associated with increased mortality. A cross-sectional study of Veterans Health Administration patients with diabetes indicated that 50% of those aged 75 years or older taking insulin and/or sulfonylureas were at risk of hypoglycemia [<xref ref-type="bibr" rid="ref2">2</xref>].</p>
      <p>Electronic health records (EHRs) are important resources for documenting hypoglycemia [<xref ref-type="bibr" rid="ref3">3</xref>]. However, studies have shown that many hypoglycemic events are not represented within the structured EHR information but are described in EHR notes [<xref ref-type="bibr" rid="ref4">4</xref>]. Manual chart review could be prohibitively expensive compared to automatic methods [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. Automatically extracting hypoglycemia-related information from EHR notes can be a valuable complement to structured EHR data for guiding the management of diabetes, developing high-risk alerts, monitoring the impact of quality-improvement work, and informing research on hypoglycemia prevention [<xref ref-type="bibr" rid="ref3">3</xref>]. In clinical settings, similar systems could be used to prefill structured EHR information from patient notes.</p>
      <p>However, reliably detecting hypoglycemic events in EHR notes is very challenging. First, the descriptions of hypoglycemia vary broadly across clinical notes (eg, “patient with hypoglycemia,” “she has low bs [blood sugar] level,” and “bs is in low 20”) and it is difficult to manually specify rules to accurately detect all the variations. Second, hypoglycemia, as with most adverse events, is relatively rare. Therefore, it is difficult to collect enough patient data to train a high-performing machine learning model.</p>
      <p>In this paper, we are aiming to develop a machine learning–based natural language processing (NLP) system that is able to reliably detect hypoglycemic events from EHR notes. As we are the first group to develop such a system, there are no publicly available reference datasets and baseline models for this task. We assembled an annotated dataset from 500 EHR notes, with sentences labeled as hypoglycemia related or not by experts. We trained and evaluated different sentence classification models on this dataset to find the best model architecture and hyperparameter settings for this task.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Dataset</title>
        <p>With approval from the Institutional Review Board at the University of Massachusetts Medical School, we randomly selected 500 deidentified EHR notes from among all diabetic patients who had been treated at the UMass Memorial Medical Center in 2015. Since hypoglycemia is a relatively rare event in the general population [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>], we only selected notes containing hypoglycemia code 251 from the International Classification of Diseases, Ninth Revision, Clinical Modification (ICD-9-CM): <italic>Other disorders of pancreatic internal secretion</italic>. We selected only these notes to increase the frequency of hypoglycemia occurrence and still cover most of the patterns in descriptions of hypoglycemic symptoms.</p>
        <p>For annotation, we divided each note into sentences with the natural language toolkit [<xref ref-type="bibr" rid="ref7">7</xref>]. Two domain experts annotated each sentence as containing a hypoglycemic event (<italic>Positive</italic>) or not (<italic>Negative</italic>). A sentence was annotated as <italic>Positive</italic> if it described any hypoglycemia-related diagnosis or symptoms (eg, “patient has low blood sugar level”). To measure the accuracy of the annotation, we randomly selected 50 annotated EHR notes and asked a third domain expert to review the annotations in those notes. The third domain expert agreed with all existing annotations, which reflects the high quality of our annotation.</p>
      </sec>
      <sec>
        <title>Problem Formalization</title>
        <p>We formalized the detection of hypoglycemic events as a sentence classification problem: given sentence <italic>x</italic>, our models will classify its category <italic>y</italic> as either <italic>Positive</italic> or <italic>Negative</italic>. We proposed three deep learning models to tackle the classification task, the details of which are described in the following section.</p>
      </sec>
      <sec>
        <title>Model Designs</title>
        <sec>
          <title>Deep Learning Model</title>
          <sec>
            <title>Overview</title>
            <p>Deep learning models have been widely adopted in various machine learning tasks, including computer vision [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>], speech recognition [<xref ref-type="bibr" rid="ref10">10</xref>], and NLP [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. These models typically take raw data as input and apply one or more hidden layers of transformation to automatically learn the mapping between input and output. Deep learning models have already been investigated in sentence classification problems [<xref ref-type="bibr" rid="ref14">14</xref>]. In this paper, we followed Kim’s work [<xref ref-type="bibr" rid="ref14">14</xref>] by adopting a feed-forward neural network architecture (see <xref rid="figure1" ref-type="fig">Figure 1</xref>). Our model, High-Performing System for Automatically Detecting Hypoglycemic Events (HYPE), is composed of three layers: an input layer, a hidden layer, and an output layer. We investigated three kinds of hidden layers: recurrent neural network (RNN) [<xref ref-type="bibr" rid="ref15">15</xref>], convolutional neural network (CNN) [<xref ref-type="bibr" rid="ref16">16</xref>], and temporal convolutional neural network (TCN) [<xref ref-type="bibr" rid="ref17">17</xref>]. We describe the details of our system in the following sections.</p>
            <fig id="figure1" position="float">
              <label>Figure 1</label>
              <caption>
                <p>Model architecture of our High-Performing System for Automatically Detecting Hypoglycemic Events (HYPE).  The architecture can be divided into three parts: (1) an input layer computing word embeddings for each word, (2) a sentence embedding layer always generating sentence vectors of a fixed dimension regardless of the input sentence length, and (3) an output layer projecting the sentence vector onto a probability score for each class.</p>
              </caption>
              <graphic xlink:href="medinform_v7i4e14340_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </fig>
          </sec>
          <sec>
            <title>Input Layer</title>
        <p>Given a sentence, we first tokenized it into <italic>l</italic> words. We then represented each word by a distributed vector using an embedding resource that was pretrained using Word2Vec on a combined text corpus of PubMed and PubMed Central Open Access [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. In this work, we used 100-dimensional pretrained embeddings. For the words that were not in the pretrained embeddings, we randomly initialized them. Specifically, the input layer takes a tokenized sentence containing <italic>l</italic> words as input and outputs an <italic>l</italic>×<italic>n</italic> matrix <italic>W</italic>, where the <italic>i</italic>-th row of <italic>W</italic> is the <italic>n</italic>-dimensional embedding of the <italic>i</italic>-th word in the sentence.</p>
          </sec>
          <sec>
            <title>Hidden Layer</title>
            <p>The dimension of the matrix <italic>W</italic> we get from the input layer is <italic>l</italic>×<italic>n</italic>, where <italic>l</italic> is the sentence length. Therefore, <italic>W</italic> cannot be directly processed by a standard feed-forward neural network. To handle this problem, we used a hidden layer to transform <italic>W</italic> to a fixed-length vector <italic>C</italic>. In this work, we experimented with three variations: RNN, CNN, and TCN.</p>
            <p>For RNN, we used long short-term memory (LSTM) [<xref ref-type="bibr" rid="ref20">20</xref>], which is a common type of neural network for processing sequential data [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>] (see <xref rid="figure2" ref-type="fig">Figure 2</xref>). Given a matrix <italic>W</italic>, we sequentially fed each row vector into the LSTM unit, along with the hidden vector generated at the previous step. We then used the hidden vector at the last step, <italic>h<sub>l</sub></italic>, as the representation of this sentence. At the same time, we could process the sentences in both forward and reverse orders using a bidirectional version of the RNN. The final sentence vector <italic>H</italic> is the concatenation of the last hidden vectors from the forward and backward directions. A formalized description and details of the RNN are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
            <p>For the CNN, we utilized a widely used architecture [<xref ref-type="bibr" rid="ref14">14</xref>] (see <xref rid="figure3" ref-type="fig">Figure 3</xref>). Specifically, we applied several filters with fixed-length windows to slide on the sentence. For the <italic>i</italic>-th filter, it generated multiple values <italic>c<sub>i</sub></italic>=[<italic>c<sub>i</sub></italic><sub>,1</sub>, <italic>c<sub>i</sub></italic><sub>,2</sub>, ..., <italic>c<sub>i</sub></italic><sub>,</sub><italic><sub>l</sub></italic><sub>-</sub><italic><sub>m</sub></italic><sub>+1</sub>], where <italic>m</italic> is the length of the window. Next, a max-over-time pooling was applied to <italic>c<sub>i</sub></italic> to produce the output value of this filter. Finally, the outputs of these filters were concatenated to form the sentence representation <italic>H</italic>. A formalized description and details of the CNN are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
            <p>For the TCN, we employed a recently proposed architecture [<xref ref-type="bibr" rid="ref17">17</xref>]. It utilized a one-dimensional fully convolutional network and a causal convolution network at the same time. In a fully convolutional network, the output layer is the same length as the input layer after the convolution operation. The causal convolution ensures that there is no leakage of information from the future to the past (ie, the output at time <italic>t</italic> is convolved only with elements from time <italic>t</italic> and earlier in the input layer). Dilated convolution and residual connections were used in each layer to help maintain a long history size and train a deep network [<xref ref-type="bibr" rid="ref23">23</xref>]. A formalized description and details of the TCN are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
            <fig id="figure2" position="float">
              <label>Figure 2</label>
              <caption>
                <p>Recurrent neural network layer with forward and backward connections. In a unidirectional setting, the backward connections (dashed lines) are absent.</p>
              </caption>
              <graphic xlink:href="medinform_v7i4e14340_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </fig>
            <fig id="figure3" position="float">
              <label>Figure 3</label>
              <caption>
                <p>Convolutional neural network layer. Each color represents a different filter with possibly different window size. The max pooling operation produces a single signal value for each filter and the sentence vector is constructed by concatenating signal values from all filters.</p>
              </caption>
              <graphic xlink:href="medinform_v7i4e14340_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </fig>
          </sec>
          <sec>
            <title>Output Layer</title>
            <p>The output layer predicts whether the current sentence contains a hypoglycemic event (<italic>Positive</italic>) or not (<italic>Negative</italic>), based on the hidden representation <italic>H</italic> from the previous layer. The output layer squashes the hidden representation to a two-dimensional vector (ie, matrix multiplication) and transforms it to probability scores of <italic>Positive</italic> and <italic>Negative</italic> classes (ie, computing softmax). To train our model, we used the cross-entropy loss and standard backpropagation algorithm. The models were trained for 50 epochs with early stopping (ie, the parameter settings with the best performance on the development set were chosen for evaluation on the testing set).</p>
          </sec>
          <sec>
            <title>Baseline Model</title>
            <p>We applied support vector machines (SVMs) [<xref ref-type="bibr" rid="ref24">24</xref>], commonly used learning algorithms for classification problems, as our baseline model. SVMs have been shown to outperform neural network models in some clinical applications [<xref ref-type="bibr" rid="ref25">25</xref>]. SVMs use kernels to separate data points belonging to different classes in a nonlinearly transformed space. We used the scikit-learn package, version 0.19.0 [<xref ref-type="bibr" rid="ref26">26</xref>], in Python, version 2.7 (Python Software Foundation), to implement the SVM model and performed grid search for the best hyperparameter settings, such as different kernel functions, down-sampling rate, class weights, penalty parameters, and various n-grams. Training was repeated until convergence of the cost function. We experimented with two kinds of feature vectors: word embedding and <italic>term frequency-inverse document frequency</italic> (TFIDF) matrix. With word embedding vectorization, each sentence is vectorized by the mean of its word embeddings. With TFIDF vectorization, each sentence is vectorized by a long sparse vector with the dimension equal to the vocabulary size. Each dimension of the vector is the TFIDF of the corresponding word in the sentence with respect to the training set corpus; common stop words are removed.</p>
          </sec>
        </sec>
      </sec>
      <sec>
        <title>Hyperparameter Settings of Deep Learning Models</title>
        <p>We performed a grid search for the optimal hyperparameter settings for the deep learning models using the development set (see <xref ref-type="table" rid="table1">Table 1</xref>). Overall, the final performance was not very sensitive to the hyperparameter settings. However, we observed that different choices of the learning rate could greatly affect the convergence time. Our best-performing model was trained using the Adam algorithm [<xref ref-type="bibr" rid="ref27">27</xref>] with an optimum batch size of 64 and learning rate of 5×10<sup>-5</sup>. To prevent overfitting, we added a dropout layer [<xref ref-type="bibr" rid="ref28">28</xref>] with an optimum dropout rate of 0.5 in the output layer. The dimension of the word embeddings was set to 100 and the optimum sentence vector setting was 300.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Hyperparameter settings in our model.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="200"/>
            <col width="300"/>
            <thead>
              <tr valign="top">
                <td>Hyperparameter</td>
                <td>Optimum value</td>
                <td>Search range</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Learning rate</td>
                <td>5×10<sup>-5</sup></td>
                <td>{1×10<sup>-3</sup>, 1×10<sup>-4</sup>, ..., 1×10<sup>-6</sup>}</td>
              </tr>
              <tr valign="top">
                <td>Batch size</td>
                <td>64</td>
                <td>{16, 32, 64, 128, 256}</td>
              </tr>
              <tr valign="top">
                <td>Sentence vector size</td>
                <td>300</td>
                <td>{100, 200, 300, 400, 500}</td>
              </tr>
              <tr valign="top">
                <td>Dropout rate</td>
                <td>0.5</td>
                <td>{0.1, 0.2, 0.3, ..., 0.8}</td>
              </tr>
              <tr valign="top">
                <td>Down-sampling rate</td>
                <td>0<sup>a</sup></td>
                <td>{0, 0.1, ..., 1}</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>The optimum setting had no down-sampling.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Evaluation Metrics</title>
        <p>We performed 10-fold cross-validation. The dataset was randomly split into 10 groups of 50 notes. For each fold, we used one group as the testing set and the rest made up the training set. The development set was constructed by randomly selecting 10% of the notes from the training set.</p>
        <p>We report recall, precision, and F1 scores for the performance of our models. They are all quantities between 0 and 1. Let <italic>P</italic> denote the set of the positive instances in the testing dataset and <italic>A</italic> denote the set of instances that are predicted to be positive by the model. Obviously, the set <italic>P</italic>∩<italic>A</italic> represents the set of positive instances that get correctly classified. Recall is the number of true positive instances divided by the number of positive instances in the dataset (ie, &#124;<italic>P</italic>∩<italic>A</italic>&#124;/&#124;<italic>P</italic>&#124;). Precision is the number of true positive instances divided by the number of predicted positive instances (ie, &#124;<italic>P</italic>∩<italic>A</italic>&#124;/&#124;<italic>A</italic>&#124;). However, neither precision nor recall alone is a good measure for model performance. For example, a simple model could consistently predict every instance to be positive and therefore achieve the maximum recall. On the other hand, it could reject every instance and achieve the maximum precision. The F1 score, which is defined by the harmonic mean of the recall and precision (ie, 2×[<italic>precision</italic>×<italic>recall</italic>]/[<italic>precision</italic>+<italic>recall</italic>]), is a much more objective measure and is common for comparing model performance. In our 10-fold cross-validation scheme, precision, recall, and F1 scores were calculated for each fold, and we report the means and standard deviations for all the folds.</p>
        <p>We also report the receiver operating characteristic (ROC) curve, which is created by plotting the true positive rate and false positive rate with different thresholds. However, in a highly imbalanced dataset as in this case, where only 3% of sentences are <italic>Positive</italic>, the ROC curve is not sufficient to reflect the true performances of different models because a classifier could achieve a high-performing ROC curve via bias toward the majority class [<xref ref-type="bibr" rid="ref29">29</xref>]. Thus, the precision-recall (PR) curve is used to remedy this problem. Because we used 10-fold cross-validation, every sentence in the dataset was assigned to the testing set once and thus received a decision score. The ROC and PR curves were constructed by pooling all the decision scores. We performed two-sample <italic>t</italic> tests for measuring statistical differences between different models.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Dataset</title>
        <p>After removing identical sentences from the dataset, the 500 EHR notes contained a total of 41,034 sentences (mean 82, SD 50) with 1316 (3.21%) (mean 2.6, SD 3) annotated as <italic>Positive</italic>. The average number of words per sentence was 11.2 (SD 11), with a minimum of 2 and a maximum of 318. The distribution of positive instances among notes was not particularly even, as is common in the case of adverse events. A total of 387 out of 500 notes (77.4%) contained positive instances and the maximum number of positive sentences from one note was 17. A total of 46.73% (615/1316) of positive sentences mentioned the word <italic>hypoglycemia</italic> directly and 22.11% (291/1316) mentioned keywords concerning blood sugar level; this includes quantitative lab results (eg, “BS [blood sugar] is 68”) or qualitative descriptions (eg, “blood sugar is high”). The rest of the sentences were mostly concerned with various hypoglycemic symptoms (eg, “feeling dizzy”).</p>
      </sec>
      <sec>
        <title>Comparisons Between the HYPE and the Baseline Model</title>
        <p>As shown in <xref ref-type="table" rid="table2">Table 2</xref>, all deep learning models outperformed the best baseline SVM model—with TFIDF vectorization and radial basis function kernel—in precision, recall, and F1 scores. For the RNN-based HYPE, LSTM and bidirectional long short-term memory (bi-LSTM) had similar performances. The TCN-based HYPE slightly outperformed the RNN-based HYPE and achieved a balanced precision and recall. The CNN-based HYPE performed the best and was the most time-efficient model due to the simplicity and parallelism of its architecture.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Performance of the SVM (support vector machine) baseline and HYPE (High-Performing System for Automatically Detecting Hypoglycemic Events) based on different kinds of neural networks.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="180"/>
            <col width="90"/>
            <col width="70"/>
            <col width="90"/>
            <col width="70"/>
            <col width="100"/>
            <col width="70"/>
            <col width="100"/>
            <col width="70"/>
            <col width="90"/>
            <col width="70"/>
            <thead>
              <tr valign="bottom">
                <td>Performance measures</td>
                <td>SVM</td>
                <td><italic>P</italic> value<sup>a</sup></td>
                <td>LSTM<sup>b</sup></td>
                <td><italic>P</italic> value</td>
                <td>Bi-LSTM<sup>c</sup></td>
                <td><italic>P</italic> value</td>
                <td>TCN<sup>d</sup></td>
                <td><italic>P</italic> value</td>
                <td>CNN<sup>e</sup></td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="bottom">
                <td>Precision, mean (SD)</td>
                <td>0.74 (0.07)</td>
                <td>&#60;.001</td>
                <td>0.91 (0.02)</td>
                <td>&#60;.001</td>
                <td>0.91 (0.02)</td>
                <td>&#60;.001</td>
                <td>0.92 (0.03)</td>
                <td>.05</td>
                <td>0.96 (0.03)</td>
                <td>N/A<sup>f</sup></td>
              </tr>
              <tr valign="bottom">
                <td>Recall, mean (SD)</td>
                <td>0.57 (0.05)</td>
                <td>&#60;.001</td>
                <td>0.86 (0.02)</td>
                <td>.02</td>
                <td>0.87 (0.04)</td>
                <td>.10</td>
                <td>0.89 (0.04)</td>
                <td>N/A</td>
                <td>0.86 (0.03)</td>
                <td>.10</td>
              </tr>
              <tr valign="bottom">
                <td>F1, mean (SD)</td>
                <td>0.64 (0.03)</td>
                <td>&#60;.001</td>
                <td>0.88 (0.02)</td>
                <td>&#60;.001</td>
                <td>0.88 (0.02)</td>
                <td>.001</td>
                <td>0.90 (0.02)</td>
                <td>.30</td>
                <td>0.91 (0.02)</td>
                <td>N/A</td>
              </tr>
              <tr valign="bottom">
                <td>PR-AUC<sup>g</sup></td>
                <td>0.745</td>
                <td>N/A</td>
                <td>0.934</td>
                <td>N/A</td>
                <td>0.942</td>
                <td>N/A</td>
                <td>0.964</td>
                <td>N/A</td>
                <td>0.966</td>
                <td>N/A</td>
              </tr>
              <tr valign="bottom">
                <td>ROC-AUC<sup>h</sup></td>
                <td>0.970</td>
                <td>N/A</td>
                <td>0.996</td>
                <td>N/A</td>
                <td>0.997</td>
                <td>N/A</td>
                <td>0.998</td>
                <td>N/A</td>
                <td>0.998</td>
                <td>N/A</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup><italic>P</italic> values are based on two-sample <italic>t</italic> tests between the performance of the system and the best-performing system; values &#60;.05 are significant.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>LSTM: long short-term memory.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>bi-LSTM: bidirectional long short-term memory.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>TCN: temporal convolutional neural network.</p>
            </fn>
            <fn id="table2fn5">
              <p><sup>e</sup>CNN: convolutional neural network.</p>
            </fn>
            <fn id="table2fn6">
              <p><sup>f</sup>N/A: not applicable.</p>
            </fn>
            <fn id="table2fn7">
              <p><sup>g</sup>PR-AUC: precision-recall area under the curve.</p>
            </fn>
            <fn id="table2fn8">
              <p><sup>h</sup>ROC-AUC: receiver operating characteristic area under the curve.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>In terms of the receiver operating characteristic area under the curve (ROC-AUC), all of our models achieved good scores (&#62;0.95) because of the highly imbalanced nature of our dataset. We also reported the precision-recall area under the curve (PR-AUC) value of each model, which is more suitable for skewed datasets [<xref ref-type="bibr" rid="ref29">29</xref>], as in our case. The ROC and PR curves show that the CNN model has the best PR curve and PR-AUC value (see <xref rid="figure4" ref-type="fig">Figure 4</xref>).</p>
      </sec>
      <sec>
        <title>Down-Sampling for Data Imbalance</title>
        <p>To address data imbalance, we experimented with down-sampling by randomly selecting a subset of the negative training examples at the start of each epoch. We used the best-performing CNN-based HYPE in the down-sampling experiments. As shown in <xref ref-type="table" rid="table3">Table 3</xref>, down-sampling increased the weight of the minority class, thus increasing the recall. However, the precision dropped because of the lack of negative examples during training. Therefore, the overall performance decreased when using down-sampling.</p>
      </sec>
      <sec>
        <title>Influence of the Training Data Size</title>
        <p>To investigate the influence of the training data size on the model performance, we varied the number of examples in the training set. A certain percentage of training examples were randomly selected, while the development and test sets remained the same. We again used the CNN-based HYPE for these experiments. As shown in <xref ref-type="table" rid="table4">Table 4</xref>, the precision of our model was only sensitive to the training size at the very smallest level.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Precision-recall (PR) and receiver operating characteristic (ROC) curves of each model. Bi-LSTM: bidirectional long short-term memory; CNN: convolutional neural network; LSTM: long short-term memory; SVM: support vector machine; TCN: temporal convolutional neural network.</p>
          </caption>
          <graphic xlink:href="medinform_v7i4e14340_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Effect of down-sampling on convolutional neural network (CNN) model performance.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="550"/>
            <col width="150"/>
            <col width="150"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td>Performance measures</td>
                <td colspan="3">Ratio of positive to negative training examples, mean (SD)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>1:1</td>
                <td>1:4</td>
                <td>1:9</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Precision</td>
                <td>0.46 (0.03)</td>
                <td>0.86 (0.04)</td>
                <td>0.93 (0.03)</td>
              </tr>
              <tr valign="top">
                <td>Recall</td>
                <td>0.92 (0.02)</td>
                <td>0.89 (0.03)</td>
                <td>0.88 (0.02)</td>
              </tr>
              <tr valign="top">
                <td>F1</td>
                <td>0.62 (0.03)</td>
                <td>0.87 (0.03)</td>
                <td>0.91 (0.02)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Convolutional neural network (CNN) model performance with percentage reduction in training examples.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="250"/>
            <col width="150"/>
            <col width="150"/>
            <col width="150"/>
            <col width="150"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td>Performance measures</td>
                <td colspan="5">Percentage reduction in training examples, mean (SD)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>5%</td>
                <td>10%</td>
                <td>20%</td>
                <td>40%</td>
                <td>80%</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Precision</td>
                <td>0.81 (0.38)</td>
                <td>0.97 (0.02)</td>
                <td>0.96 (0.02)</td>
                <td>0.96 (0.03)</td>
                <td>0.95 (0.02)</td>
              </tr>
              <tr valign="top">
                <td>Recall</td>
                <td>0.03 (0.03)</td>
                <td>0.43 (0.05)</td>
                <td>0.67 (0.03)</td>
                <td>0.77 (0.04)</td>
                <td>0.85 (0.03)</td>
              </tr>
              <tr valign="top">
                <td>F1</td>
                <td>0.05 (0.05)</td>
                <td>0.60 (0.04)</td>
                <td>0.79 (0.03)</td>
                <td>0.86 (0.03)</td>
                <td>0.90 (0.02)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>However, recall progressively deteriorated as the training size decreased. As the example size becomes smaller, the model tends to be more conservative about making positive predictions. The overall performance (F1) increases as the number of training examples increases, which is expected.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Our results show that HYPE outperformed SVMs by a large margin in every evaluation metric. One major difference between HYPE and SVMs is how they represent an input sentence. SVMs use bag-of-words and n-grams to represent the input sentence as a sparse vector. In contrast, HYPE uses neural networks to convert the input sentence into a dense vector, which is able to improve the representation ability while avoiding sparsity [<xref ref-type="bibr" rid="ref18">18</xref>]. Our results also show that neural network models can successfully be trained using a relatively small and imbalanced dataset: a total of 41,034 sentences, of which 1316 sentences were positive instances. The implication is significant as the “knowledge-bottleneck” challenge has made it unrealistic to annotate a large amount of clinical data for supervised machine learning applications.</p>
      </sec>
      <sec>
        <title>Comparisons Between Different Hidden Layers of HYPE</title>
        <p>In our results, HYPE achieved good performance for detecting sentence-level hypoglycemia, even though the data were imbalanced. We also found that the commonly used approach of down-sampling did not improve performance. While CNN-based HYPE achieved the best precision (mean 0.96, SD 0.03), TCN-based HYPE achieved the best recall (mean 0.89, SD 0.04). One possible explanation for the difference in recall is that CNN is able to capture only the local contextual expressions of hypoglycemic events. TCN is a version of CNN that is equipped with residual connections and dilated convolutions; as such, TCN has the advantage of capturing information in a long context. However, CNN outperformed TCN for the overall performance. CNN also outperformed the two RNN-based models (ie, LSTM and bi-LSTM). This suggests that RNN is less effective than CNN in capturing the contextual patterns of hypoglycemic events. The performance of CNN might be further improved by adding an attention mechanism but we leave this investigation for future work. As for time efficiency, RNN-based HYPE was 10 times slower than the CNN in training. This is because we need to perform many expensive computations in the LSTM units and RNN is hard to parallelize due to its recurrent nature. Thus, CNN is more suitable for our task than RNN.</p>
      </sec>
      <sec>
        <title>Effects of Tuning Word Embeddings</title>
        <p>A common practice for NLP tasks when working with a small dataset is to fix the pretrained word embeddings during training. The rationale is that when the model encounters a word in the testing set that is not present in the training set, the model is still able to make correct predictions because its embedding is close to a similar word present in the training set. However, in our experiments, if the embeddings were fixed, we observed a 3%-4% performance loss in F1 score. The best-performing approach was to update word embeddings through backpropagation. The reason for the performance loss of fixed pretrained embeddings might be that the vocabulary size used for describing hypoglycemic events is both small and domain specific. Pretrained embeddings allow a model to attain useful information on general words in the open domain, but fine-tuning word embeddings allows the model to learn domain-specific knowledge. An interesting example is that, if word embeddings were fixed, the model would not be able to discriminate “blood sugar is low” from “blood sugar is high.” This may be because the words “high” and “low” have similar distributions in the open domain and because their embeddings are very close to each other. If we tuned their embeddings, the model could learn that “low” and “high” have very different semantics.</p>
      </sec>
      <sec>
        <title>Error Analysis</title>
        <p>We manually examined the error cases and identified two types of common errors. First, HYPE often failed in cases where hypoglycemic events were indicated by numerical measurements of blood sugar levels. Our model could easily identify sentences such as “BS is low” as hypoglycemic events but it often made mistakes when it encountered sentences such as “BS is 68” or “fsbs [finger stick blood sugar] noted to be 9.” Such sentences are difficult to identify for many reasons. One reason is that the word embedding we used in this work transformed numbers to zero during training in order to avoid sparsity [<xref ref-type="bibr" rid="ref18">18</xref>]. Therefore, the number value was lost in the embedding space and it was impossible for the model to learn a <italic>less than</italic> operation to identify low blood sugar value. Also, the units of the numeric value were often absent and, therefore, needed to be inferred from the context. In the above examples, “68” should be “68 mg/dL” and “9” should be “9 nmol/L.” Since such information may not be obtained from the sentence, external human knowledge along with clear definitions for hypoglycemic blood glucose values must be incorporated. In the future, we will explore effective approaches to cope with this issue.</p>
        <p>The second type of error was negated events, such as “The patient had no seizures, headaches, abdominal pain, sweating, or other adrenergic symptoms of hypoglycemia.” In this example, HYPE failed to understand the negated word “no” and identified this sentence as a hypoglycemic event. Because the number of such sentences was small, it would be difficult to solve this problem by adding additional features to capture the negation expression. Therefore, we need to incorporate additional approaches for negation identification [<xref ref-type="bibr" rid="ref30">30</xref>].</p>
      </sec>
      <sec>
        <title>Limitations and Future Work</title>
        <p>The main limitation of our study is that we selected EHR notes using only diabetes-related ICD-9-CM codes, so the scale of our dataset was relatively small and may not have reflected the true distribution of hypoglycemia sentences in real-world applications. Moreover, because HYPE focuses on sentence-level event detection, it will miss hypoglycemic events that are expressed across multiple sentences. In future work, we will explore document-level hypoglycemic event detection.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this study, we developed and evaluated state-of-the-art machine learning models to detect hypoglycemic events from EHR notes. We explored three different deep learning models—RNN, CNN, and TCN—and found that the CNN model performed the best, achieving 96% precision and 86% recall. Our work is an important step toward automated surveillance of hypoglycemic events in EHRs and helping clinicians, health care system leaders, and researchers in their efforts to prevent hypoglycemia and to safely manage diabetes mellitus.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>A formalized description and details of the recurrent neural network (RNN), the convolutional neural network (CNN), and the temporal convolutional neural network (TCN).</p>
        <media xlink:href="medinform_v7i4e14340_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 73 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">bi-LSTM</term>
          <def>
            <p>bidirectional long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">bs/BS</term>
          <def>
            <p>blood sugar</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CNN</term>
          <def>
            <p>convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">fsbs</term>
          <def>
            <p>finger stick blood sugar</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">HYPE</term>
          <def>
            <p>High-Performing Systems for Automatically Detecting Hypoglycemic Events</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">ICD-9-CM</term>
          <def>
            <p>International Classification of Diseases, Ninth Revision, Clinical Modification</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">LSTM</term>
          <def>
            <p>long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">PR</term>
          <def>
            <p>precision-recall</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">PR-AUC</term>
          <def>
            <p>precision-recall area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">RNN</term>
          <def>
            <p>recurrent neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">ROC</term>
          <def>
            <p>receiver operating characteristic</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">ROC-AUC</term>
          <def>
            <p>receiver operating characteristic area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">TCN</term>
          <def>
            <p>temporal convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">TFIDF</term>
          <def>
            <p>term frequency-inverse document frequency</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by a grant from the National Heart, Lung, and Blood Institute of the National Institutes of Health (grant number: R01HL125089). The content is solely the responsibility of the authors and does not represent the views of the National Institutes of Health. We would like to acknowledge the annotation team: Heather Keating, Raelene Goodwin, Brian Corner, Nadya Frid, and Feifan Liu.</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>YJ, FL, and HY conceptualized and designed this study. YJ implemented the tools. YJ and FL processed the data. YJ wrote the manuscript. FL, HY, and VGV reviewed and contributed to editing the manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <source>Centers for Disease Control and Prevention</source>
          <year>2011</year>
          <access-date>2019-11-03</access-date>
          <publisher-loc>Atlanta, GA</publisher-loc>
          <publisher-name>US Department of Health and Human Services, Centers for Disease Control and Prevention</publisher-name>
          <comment>National diabetes fact sheet: National estimates and general information on diabetes and prediabetes in the United States, 2011 <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cdc.gov/diabetes/pubs/pdf/ndfs_2011.pdf">https://www.cdc.gov/diabetes/pubs/pdf/ndfs_2011.pdf</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bell</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Yumuk</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Frequency of severe hypoglycemia in patients with non-insulin-dependent diabetes mellitus treated with sulfonylureas or insulin</article-title>
          <source>Endocr Pract</source>
          <year>1997</year>
          <volume>3</volume>
          <issue>5</issue>
          <fpage>281</fpage>
          <lpage>283</lpage>
          <pub-id pub-id-type="doi">10.4158/EP.3.5.281</pub-id>
          <pub-id pub-id-type="medline">15251781</pub-id>
          <pub-id pub-id-type="pii">YTT9B6BRG1W6M1BP</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lipska</surname>
              <given-names>KJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ross</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Inzucchi</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Minges</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Karter</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>ES</given-names>
            </name>
            <name name-style="western">
              <surname>Desai</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Gill</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Krumholz</surname>
              <given-names>HM</given-names>
            </name>
          </person-group>
          <article-title>National trends in US hospital admissions for hyperglycemia and hypoglycemia among Medicare beneficiaries, 1999 to 2011</article-title>
          <source>JAMA Intern Med</source>
          <year>2014</year>
          <month>07</month>
          <volume>174</volume>
          <issue>7</issue>
          <fpage>1116</fpage>
          <lpage>1124</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24838229"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamainternmed.2014.1824</pub-id>
          <pub-id pub-id-type="medline">24838229</pub-id>
          <pub-id pub-id-type="pii">1871566</pub-id>
          <pub-id pub-id-type="pmcid">PMC4152370</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>Workgroup on Hypoglycemia‚ American Diabetes Association</collab>
          </person-group>
          <article-title>Defining and reporting hypoglycemia in diabetes: A report from the American Diabetes Association Workgroup on Hypoglycemia</article-title>
          <source>Diabetes Care</source>
          <year>2005</year>
          <month>05</month>
          <volume>28</volume>
          <issue>5</issue>
          <fpage>1245</fpage>
          <lpage>1249</lpage>
          <pub-id pub-id-type="doi">10.2337/diacare.28.5.1245</pub-id>
          <pub-id pub-id-type="medline">15855602</pub-id>
          <pub-id pub-id-type="pii">28/5/1245</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Melton</surname>
              <given-names>GB</given-names>
            </name>
            <name name-style="western">
              <surname>Moeller</surname>
              <given-names>ND</given-names>
            </name>
            <name name-style="western">
              <surname>Arsoniadis</surname>
              <given-names>EG</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kwaan</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Jensen</surname>
              <given-names>EH</given-names>
            </name>
            <name name-style="western">
              <surname>Simon</surname>
              <given-names>GJ</given-names>
            </name>
          </person-group>
          <article-title>Accelerating chart review using automated methods on electronic health record data for postoperative complications</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2016</year>
          <volume>2016</volume>
          <fpage>1822</fpage>
          <lpage>1831</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/28269941"/>
          </comment>
          <pub-id pub-id-type="medline">28269941</pub-id>
          <pub-id pub-id-type="pmcid">PMC5333220</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Martinez</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ananda-Rajah</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Suominen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Slavin</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Thursky</surname>
              <given-names>KA</given-names>
            </name>
            <name name-style="western">
              <surname>Cavedon</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Automatic detection of patients with invasive fungal disease from free-text computed tomography (CT) scans</article-title>
          <source>J Biomed Inform</source>
          <year>2015</year>
          <month>02</month>
          <volume>53</volume>
          <fpage>251</fpage>
          <lpage>260</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(14)00239-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2014.11.009</pub-id>
          <pub-id pub-id-type="medline">25460203</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(14)00239-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bird</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Klein</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Loper</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <source>Natural Language Processing with Python: Analyzing Text With the Natural Language Toolkit</source>
          <year>2009</year>
          <publisher-loc>Sebastopol, CA</publisher-loc>
          <publisher-name>O'Reilly Media</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krizhevsky</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Hinton</surname>
              <given-names>GE</given-names>
            </name>
          </person-group>
          <article-title>ImageNet classification with deep convolutional neural networks</article-title>
          <source>Commun ACM</source>
          <year>2017</year>
          <month>05</month>
          <day>24</day>
          <volume>60</volume>
          <issue>6</issue>
          <fpage>84</fpage>
          <lpage>90</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://papers.nips.cc/paper/4824-imagenet"/>
          </comment>
          <pub-id pub-id-type="doi">10.1145/3065386</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Russakovsky</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Krause</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Satheesh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Karpathy</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Khosla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bernstein</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Berg</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Fei-Fei</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>ImageNet large-scale visual recognition challenge</article-title>
          <source>Int J Comput Vis</source>
          <year>2015</year>
          <month>04</month>
          <day>11</day>
          <volume>115</volume>
          <issue>3</issue>
          <fpage>211</fpage>
          <lpage>252</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1409.0575"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11263-015-0816-y</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hinton</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Dahl</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Mohamed</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jaitly</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Senior</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Vanhoucke</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sainath</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kingsbury</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Deep neural networks for acoustic modeling in speech recognition: The shared views of four research groups</article-title>
          <source>IEEE Signal Process Mag</source>
          <year>2012</year>
          <month>11</month>
          <volume>29</volume>
          <issue>6</issue>
          <fpage>82</fpage>
          <lpage>97</lpage>
          <pub-id pub-id-type="doi">10.1109/msp.2012.2205597</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Collobert</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Weston</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A unified architecture for natural language processing: Deep neural networks with multitask learning</article-title>
          <source>Proceedings of the 25th International Conference on Machine Learning</source>
          <year>2008</year>
          <conf-name>25th International Conference on Machine Learning</conf-name>
          <conf-date>July 5-9, 2008</conf-date>
          <conf-loc>Helsinki, Finland</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dl.acm.org/citation.cfm?id=1390177"/>
          </comment>
          <pub-id pub-id-type="doi">10.1145/1390156.1390177</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Pennington</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Semi-supervised recursive autoencoders for predicting sentiment distributions</article-title>
          <source>Proceedings of the Conference on Empirical Methods in Natural Language Processing</source>
          <year>2011</year>
          <conf-name>Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>July 27–31, 2011</conf-date>
          <conf-loc>Edinburgh, Scotland</conf-loc>
          <fpage>151</fpage>
          <lpage>161</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/D11-1014.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Parsing natural scenes and natural language with recursive neural networks</article-title>
          <source>Proceedings of the 28th International Conference on Machine Learning</source>
          <year>2011</year>
          <month>06</month>
          <day>28</day>
          <conf-name>28th International Conference on Machine Learning</conf-name>
          <conf-date>June 28-July 2, 2011</conf-date>
          <conf-loc>Bellevue, WA</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://machinelearning.wustl.edu/mlpapers/paper_files/ICML2011Socher_125.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Convolutional neural networks for sentence classification</article-title>
          <source>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</source>
          <year>2014</year>
          <conf-name>2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name>
          <conf-date>October 26-28, 2014</conf-date>
          <conf-loc>Doha, Qatar</conf-loc>
          <pub-id pub-id-type="doi">10.3115/v1/D14-1181</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schuster</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Paliwal</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Bidirectional recurrent neural networks</article-title>
          <source>IEEE Trans Signal Process</source>
          <year>1997</year>
          <month>11</month>
          <day>01</day>
          <volume>45</volume>
          <issue>11</issue>
          <fpage>2673</fpage>
          <lpage>2681</lpage>
          <pub-id pub-id-type="doi">10.1109/78.650093</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Collins</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Duffy</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Convolution kernels for natural language</article-title>
          <source>Proceedings of the 14th International Conference on Neural Information Processing Systems: Natural and Synthetic</source>
          <year>2001</year>
          <conf-name>14th International Conference on Neural Information Processing Systems: Natural and Synthetic</conf-name>
          <conf-date>December 3-8, 2001</conf-date>
          <conf-loc>Vancouver, BC</conf-loc>
          <fpage>625</fpage>
          <lpage>632</lpage>
          <pub-id pub-id-type="doi">10.7551/mitpress/1120.003.0085</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kolter</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Koltun</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <source>arXiv</source>
          <year>2018</year>
          <month>03</month>
          <day>04</day>
          <access-date>2018-08-24</access-date>
          <comment>An empirical evaluation of generic convolutional and recurrent networks for sequence modeling<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1803.01271">http://arxiv.org/abs/1803.01271</ext-link></comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Distributed representations of words and phrases and their compositionality</article-title>
          <source>Proceedings of the 26th International Conference on Neural Information Processing Systems (NIPS'13). Volume 2</source>
          <year>2013</year>
          <conf-name>26th International Conference on Neural Information Processing Systems (NIPS'13). Volume 2</conf-name>
          <conf-date>December 5-10, 2013</conf-date>
          <conf-loc>Lake Tahoe, NV</conf-loc>
          <fpage>3111</fpage>
          <lpage>3119</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pyysalo</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ginter</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Moen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Salakoski</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ananiadou</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Distributional semantics resources for biomedical text processing</article-title>
          <source>Proceedings of the 5th International Symposium on Languages in Biology and Medicine</source>
          <year>2013</year>
          <conf-name>5th International Symposium on Languages in Biology and Medicine</conf-name>
          <conf-date>December 12-13, 2013</conf-date>
          <conf-loc>Tokyo, Japan</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hochreiter</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidhuber</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Long short-term memory</article-title>
          <source>Neural Comput</source>
          <year>1997</year>
          <month>11</month>
          <volume>9</volume>
          <issue>8</issue>
          <fpage>1735</fpage>
          <lpage>1780</lpage>
          <pub-id pub-id-type="doi">10.1162/neco.1997.9.8.1735</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jagannatha</surname>
              <given-names>AN</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Bidirectional RNN for medical event detection in electronic health records</article-title>
          <source>Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2016</year>
          <conf-name>Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name>
          <conf-date>June 13-15, 2016</conf-date>
          <conf-loc>San Diego, CA</conf-loc>
          <fpage>473</fpage>
          <lpage>482</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/N16-1056.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/n16-1056</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jagannatha</surname>
              <given-names>AN</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Structured prediction models for RNN-based sequence labeling in clinical text</article-title>
          <source>Proceedings of the Conference on Empirical Methods in Natural Language Processing</source>
          <year>2016</year>
          <conf-name>Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>November 1-5, 2016</conf-date>
          <conf-loc>Austin, TX</conf-loc>
          <fpage>856</fpage>
          <lpage>865</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/D16-1082.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/d16-1082</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>He</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Deep residual learning for image recognition</article-title>
          <source>Proceedings of the 2016 IEEE Conference on Computer Vision and Pattern Recognition</source>
          <year>2016</year>
          <conf-name>2016 IEEE Conference on Computer Vision and Pattern Recognition</conf-name>
          <conf-date>June 26-July 1, 2016</conf-date>
          <conf-loc>Las Vegas, NV</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1512.03385.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/CVPR.2016.90</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cortes</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Vapnik</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Support-vector networks</article-title>
          <source>Mach Learn</source>
          <year>1995</year>
          <month>09</month>
          <volume>20</volume>
          <issue>3</issue>
          <fpage>273</fpage>
          <lpage>297</lpage>
          <pub-id pub-id-type="doi">10.1007/BF00994018</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Munkhdalai</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Clinical relation extraction toward drug safety surveillance using electronic health record narratives: Classical learning versus deep learning</article-title>
          <source>JMIR Public Health Surveill</source>
          <year>2018</year>
          <month>04</month>
          <day>25</day>
          <volume>4</volume>
          <issue>2</issue>
          <fpage>e29</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://publichealth.jmir.org/2018/2/e29/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/publichealth.9361</pub-id>
          <pub-id pub-id-type="medline">29695376</pub-id>
          <pub-id pub-id-type="pii">v4i2e29</pub-id>
          <pub-id pub-id-type="pmcid">PMC5943628</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pedregosa</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Varoquaux</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gramfort</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Michel</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Thirion</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Grisel</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Blondel</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Müller</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Nothman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Louppe</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Prettenhofer</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Weiss</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Dubourg</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Vanderplas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Passos</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cournapeau</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Brucher</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Perrot</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Duchesnay</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Scikit-learn: Machine learning in Python</article-title>
          <source>J Mach Learn Res</source>
          <year>2011</year>
          <volume>12</volume>
          <fpage>2825</fpage>
          <lpage>2830</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1201.0490.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kingma</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ba</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Adam: A method for stochastic optimization</article-title>
          <source>Proceedings of the 3rd International Conference on Learning Representations</source>
          <year>2015</year>
          <conf-name>3rd International Conference on Learning Representations</conf-name>
          <conf-date>May 7-9, 2015</conf-date>
          <conf-loc>San Diego, CA</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1412.6980.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Srivastava</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hinton</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Krizhevsky</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Salakhutdinov</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Dropout: A simple way to prevent neural networks from overfitting</article-title>
          <source>J Mach Learn Res</source>
          <year>2014</year>
          <month>06</month>
          <day>14</day>
          <volume>15</volume>
          <fpage>1929</fpage>
          <lpage>1958</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://jmlr.csail.mit.edu/papers/volume15/srivastava14a/srivastava14a.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Goadrich</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>The relationship between precision-recall and ROC curves</article-title>
          <source>Proceedings of the 23rd International Conference on Machine Learning</source>
          <year>2006</year>
          <conf-name>23rd International Conference on Machine Learning</conf-name>
          <conf-date>June 25-29, 2006</conf-date>
          <conf-loc>Pittsburgh, PA</conf-loc>
          <pub-id pub-id-type="doi">10.1145/1143844.1143874</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Biomedical negation scope detection with conditional random fields</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2010</year>
          <volume>17</volume>
          <issue>6</issue>
          <fpage>696</fpage>
          <lpage>701</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/20962133"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/jamia.2010.003228</pub-id>
          <pub-id pub-id-type="medline">20962133</pub-id>
          <pub-id pub-id-type="pii">17/6/696</pub-id>
          <pub-id pub-id-type="pmcid">PMC3000754</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
