<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v7i2e12109</article-id>
      <article-id pub-id-type="pmid">31066686</article-id>
      <article-id pub-id-type="doi">10.2196/12109</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Natural Language Processing for the Identification of Silent Brain Infarcts From Neuroimaging Reports</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zheng</surname>
            <given-names>Jiaping</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Vydiswaran</surname>
            <given-names>Vinod</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Oram</surname>
            <given-names>Daniel</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="author" id="contrib1">
          <name name-style="western">
            <surname>Fu</surname>
            <given-names>Sunyang</given-names>
          </name>
          <degrees>MHI</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-1691-5179</ext-link>
        </contrib>
        <contrib contrib-type="author" id="contrib2">
          <name name-style="western">
            <surname>Leung</surname>
            <given-names>Lester Y</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-5027-7740</ext-link>
        </contrib>
        <contrib contrib-type="author" id="contrib3">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Yanshan</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-4433-7839</ext-link>
        </contrib>
        <contrib contrib-type="author" id="contrib4">
          <name name-style="western">
            <surname>Raulli</surname>
            <given-names>Anne-Olivia</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-5273-0430</ext-link>
        </contrib>
        <contrib contrib-type="author" id="contrib5">
          <name name-style="western">
            <surname>Kallmes</surname>
            <given-names>David F</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-8495-0040</ext-link>
        </contrib>
        <contrib contrib-type="author" id="contrib6">
          <name name-style="western">
            <surname>Kinsman</surname>
            <given-names>Kristin A</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">http://orcid.org/0000-0001-7704-4114</ext-link>
        </contrib>
        <contrib contrib-type="author" id="contrib7">
          <name name-style="western">
            <surname>Nelson</surname>
            <given-names>Kristoff B</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-3506-9847</ext-link>
        </contrib>
        <contrib contrib-type="author" id="contrib8">
          <name name-style="western">
            <surname>Clark</surname>
            <given-names>Michael S</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-3287-8478</ext-link>
        </contrib>
        <contrib contrib-type="author" id="contrib9">
          <name name-style="western">
            <surname>Luetmer</surname>
            <given-names>Patrick H</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-4660-7644</ext-link>
        </contrib>
        <contrib contrib-type="author" id="contrib10">
          <name name-style="western">
            <surname>Kingsbury</surname>
            <given-names>Paul R</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-1835-466X</ext-link>
        </contrib>
        <contrib contrib-type="author" id="contrib11">
          <name name-style="western">
            <surname>Kent</surname>
            <given-names>David M</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-9205-5070</ext-link>
        </contrib>
        <contrib contrib-type="author" id="contrib12" corresp="yes">
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>Hongfang</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Health Sciences Research</institution>
            <institution>Mayo Clinic</institution>
            <addr-line>205 3rd Ave SW</addr-line>
            <addr-line>Rochester, MN,</addr-line>
            <country>United States</country>
            <phone>1 5077730057</phone>
            <email>Liu.Hongfang@mayo.edu</email>
          </address>
          <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-2570-3741</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
      <label>1</label>
      <institution>Department of Health Sciences Research</institution>
      <institution>Mayo Clinic</institution>  
      <addr-line>Rochester, MN</addr-line>
      <country>United States</country></aff>
      <aff id="aff2">
      <label>2</label>
      <institution>Department of Neurology</institution>
      <institution>Tufts Medical Center</institution>  
      <addr-line>Boston, MA</addr-line>
      <country>United States</country></aff>
      <aff id="aff3">
      <label>3</label>
      <institution>Department of Radiology</institution>
      <institution>Mayo Clinic</institution>  
      <addr-line>Rochester, MN</addr-line>
      <country>United States</country></aff>
      <aff id="aff4">
      <label>4</label>
      <institution>Institute for Clinical Research and Health Policy Studies</institution>
      <institution>Tufts Medical Center</institution>  
      <addr-line>Boston, MA</addr-line>
      <country>United States</country></aff>
      <author-notes>
        <corresp>Corresponding Author: Hongfang Liu 
        <email>Liu.Hongfang@mayo.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <season>Apr-Jun</season>
        <year>2019</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>21</day>
        <month>4</month>
        <year>2019</year>
      </pub-date>
      <volume>7</volume>
      <issue>2</issue>
      <elocation-id>e12109</elocation-id>
      <!--history from ojs - api-xml-->
      <history>
        <date date-type="received">
          <day>5</day>
          <month>9</month>
          <year>2018</year>
        </date>
        <date date-type="rev-request">
          <day>7</day>
          <month>1</month>
          <year>2019</year>
        </date>
        <date date-type="rev-recd">
          <day>26</day>
          <month>2</month>
          <year>2019</year>
        </date>
        <date date-type="accepted">
          <day>30</day>
          <month>3</month>
          <year>2019</year>
        </date>
      </history>
      <copyright-statement>©Sunyang Fu, Lester Y Leung, Yanshan Wang, Anne-Olivia Raulli, David F Kallmes, Kristin A Kinsman, Kristoff B Nelson, Michael S Clark, Patrick H Luetmer, Paul R Kingsbury, David M Kent, Hongfang Liu. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 21.04.2019.</copyright-statement>
      <copyright-year>2019</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://medinform.jmir.org/2019/2/e12109/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Silent brain infarction (SBI) is defined as the presence of 1 or more brain lesions, presumed to be because of vascular occlusion, found by neuroimaging (magnetic resonance imaging or computed tomography) in patients without clinical manifestations of stroke. It is more common than stroke and can be detected in 20% of healthy elderly people. Early detection of SBI may mitigate the risk of stroke by offering preventative treatment plans. Natural language processing (NLP) techniques offer an opportunity to systematically identify SBI cases from electronic health records (EHRs) by extracting, normalizing, and classifying SBI-related incidental findings interpreted by radiologists from neuroimaging reports.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to develop NLP systems to determine individuals with incidentally discovered SBIs from neuroimaging reports at 2 sites: Mayo Clinic and Tufts Medical Center.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Both rule-based and machine learning approaches were adopted in developing the NLP system. The rule-based system was implemented using the open source NLP pipeline MedTagger, developed by Mayo Clinic. Features for rule-based systems, including significant words and patterns related to SBI, were generated using pointwise mutual information. The machine learning models adopted convolutional neural network (CNN), random forest, support vector machine, and logistic regression. The performance of the NLP algorithm was compared with a manually created gold standard. The gold standard dataset includes 1000 radiology reports randomly retrieved from the 2 study sites (Mayo and Tufts) corresponding to patients with no prior or current diagnosis of stroke or dementia. A total of 400 out of the 1000 reports were randomly sampled and double read to determine interannotator agreements. The gold standard dataset was equally split into 3 subsets for training, developing, and testing.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Among the 400 reports selected to determine interannotator agreement, 5 reports were removed due to invalid scan types. The interannotator agreements across Mayo and Tufts neuroimaging reports were 0.87 and 0.91, respectively. The rule-based system yielded the best performance of predicting SBI with an accuracy, sensitivity, specificity, positive predictive value (PPV), and negative predictive value (NPV) of 0.991, 0.925, 1.000, 1.000, and 0.990, respectively. The CNN achieved the best score on predicting white matter disease (WMD) with an accuracy, sensitivity, specificity, PPV, and NPV of 0.994, 0.994, 0.994, 0.994, and 0.994, respectively.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>We adopted a standardized data abstraction and modeling process to develop NLP techniques (rule-based and machine learning) to detect incidental SBIs and WMDs from annotated neuroimaging reports. Validation statistics suggested a high feasibility of detecting SBIs and WMDs from EHRs using NLP.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>neuroimaging</kwd>
        <kwd>electronic health records</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Silent brain infarction (SBI) is defined as the presence of 1 or more brain lesions, presumed to be because of vascular occlusion, found by neuroimaging (magnetic resonance imaging, MRI or computed tomography, CT) in patients without clinical manifestations of stroke. SBIs are more common than stroke and can be detected on MRI in 20% of healthy elderly [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. Studies have shown that SBIs are associated with increased risk of subsequent stroke, cognitive decline, and deficiency in physical function [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Despite the high prevalence and serious consequences, there is no consensus on the management of SBI as routinely discovering SBIs is challenged by the absence of corresponding diagnosis codes and the lack of the knowledge about the characteristics of the affected population, treatment patterns, or the effectiveness of therapy [<xref ref-type="bibr" rid="ref1">1</xref>]. Even though there is strong evidence that antiplatelet and statin therapies are effective in preventing recurrent stroke in patients with prior stroke, the degree to which these results might apply to patients with SBI is unclear. Although SBI is understood by some clinicians to be pathophysiologically identical to stroke (and thus similarly treated), others view SBI as an incidental neuroimaging finding of unclear significance. The American Heart Association/American Stroke Association has identified SBI as a major priority for new studies on stroke prevention because the population affected by SBI falls between primary and secondary stroke prevention [<xref ref-type="bibr" rid="ref4">4</xref>].</p>
        <p>In addition to SBI, white matter disease (WMD) or leukoaraiosis is another common finding in neuroimaging of elderly. Similar to SBI, WMD is usually detected incidentally on brain scans and is commonly believed to be a form of microvascular ischemic brain damage resulting from typical cardiovascular risk factors [<xref ref-type="bibr" rid="ref5">5</xref>]. WMD is associated with subcortical infarcts due to small vessel disease and is predictive of functional disability, recurrent stroke, and dementia [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. SBI and WMD are related, but it is unclear whether they result from the same, independent, or synergistic processes [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. As with SBI, there are no proven preventive treatments or guidelines regarding the initiation of risk factor–modifying therapies when WMD is discovered.</p>
      </sec>
      <sec>
        <title>Objectives</title>
        <p>Identifying patients with SBI is challenged by the absence of corresponding diagnosis codes. One reason is that SBI-related incidental findings are not included in a patient’s problem list or other structured fields of electronic health records (EHRs); instead, the findings are captured in neuroimaging reports. A neuroimaging report is a type of EHR data that contains the interpretations and findings from neuroimaging studies such as CT and MRI in unstructured text. Incidental SBIs can be detected by the review of neuroradiology reports obtained in clinical practice, typically performed manually by radiologists or neurologists. However, manually extracting information from patient narratives is time-consuming, costly, and lacks robustness and standardization [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref14">14</xref>]. Natural language processing (NLP) has been leveraged to perform chart review for other medical conditions by automatically extracting important clinical concepts from unstructured text. Researchers have used NLP systems to identify clinical syndromes and biomedical concepts from clinical notes, radiology reports, and surgery operative notes [<xref ref-type="bibr" rid="ref15">15</xref>]. An increasing amount of NLP-enabled clinical research has been reported, ranging from identifying patient safety occurrences [<xref ref-type="bibr" rid="ref16">16</xref>] to facilitating pharmacogenomic studies [<xref ref-type="bibr" rid="ref17">17</xref>]. Our study focuses on developing NLP algorithms to routinely detect incidental SBIs and WMDs.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Study Setting</title>
        <p>This study was approved by the Mayo Clinic and Tufts Medical Center (TMC) institutional review boards. This work is part of the Effectiveness of Stroke PREvention in Silent StrOke project, which is to use NLP techniques to identify individuals with incidentally discovered SBIs from radiology reports, at 2 sites: Mayo Clinic and TMC.</p>
      </sec>
      <sec>
        <title>Gold Standard</title>
        <p>The detailed process of generating the gold standard is described in <xref ref-type="app" rid="app1">Multimedia Appendix 1</xref>. The gold standard annotation guideline was developed by 2 subject matter experts: a vascular neurologist (LYL) and a neuroradiologist (PHL), and the annotation task was performed by 2 third-year residents (KAK, MSC) from Mayo and 2 first-year residents (AOR, KN) from TMC. Each report was annotated with 1 of the 3 labels for SBI (positive SBI, indeterminate SBI, or negative SBI) and one of the 3 labels for WMD (positive WMD, indeterminate WMD, or negative WMD).</p>
        <p>The gold standard dataset includes 1000 radiology reports randomly retrieved from the 2 study sites (500 from Mayo Clinic and 500 from TMC) corresponding to patients with no prior or current diagnosis of stroke or dementia. To calculate interannotator agreement (IAA), 400 out of the 1000 reports were randomly sampled and double read. The gold standard dataset was equally split to 3 subsets for training (334), developing (333), and testing (333).</p>
      </sec>
      <sec>
        <title>Experimental Methods</title>
        <p>We compared 2 NLP approaches. One was to define the task as an information extraction (IE) task, where a rule-based IE system can be developed to extract SBI or WMD findings. The other was to define the task as a sentence classification task, where sentences can be classified to contain SBI or WMD findings.</p>
        <sec>
          <title>Rule-Based Information Extraction</title>
          <p>We adopted the open source NLP pipeline, MedTagger, as the infrastructure for the rule-based system implementation. MedTagger is a resource-driven, open source unstructured information management architecture–based IE framework [<xref ref-type="bibr" rid="ref18">18</xref>]. The system separates task-specific NLP knowledge engineering from the generic NLP process, which enables words and phrases containing clinical information to be directly coded by subject matter experts. The tool has been utilized in the eMERGE consortium to develop NLP-based phenotyping algorithms [<xref ref-type="bibr" rid="ref19">19</xref>]. <xref ref-type="fig" rid="figure1">Figure 1</xref> shows the process workflow. The generic NLP process includes sentence tokenization, text segmentation, and context detection. The task-specific NLP process includes the detection of concept mentions in the text using regular expressions and their normalization to specific concepts. The summarization component applies heuristic rules for assigning the labels to the document.</p>
          <p>For example, the sentence “probable right old frontal lobe subcortical infarct as described above,” is processed as an SBI concept with the corresponding contextual information with status as “probable,” temporality as “present,” and experiencer as “patient.”</p>
          <p>The domain-specific NLP knowledge engineering was developed following 3 steps: (1) Prototype algorithm development, (2) Formative algorithm development using the training data, and (3) Final algorithm evaluation. We leveraged pointwise mutual information [<xref ref-type="bibr" rid="ref20">20</xref>] to identify significant words and patterns associated with each condition for prototyping the algorithm (<xref ref-type="app" rid="app2">Multimedia Appendix 2</xref>). The algorithm was applied to the training data. Falsely classified reports were manually reviewed by 2 domain experts (LYL, PHL). Keywords were manually curated through an iterative refinement process until all issues were resolved. The full list of concepts, keywords, modifiers, and disease categories is provided in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>.</p>
          <fig id="figure1" position="float">
            <label>Figure 1</label>
            <caption>
              <p>Rule system process flow. SBI: silent brain infarction; WMD: white matter disease.</p>
            </caption>
            <graphic xlink:href="medinform_v7i2e12109_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <boxed-text id="box1" position="float">
            <title>Silent brain infarction (SBI) and white matter disease (WMD) risk factor and indication keywords.</title>
            <list list-type="bullet">
              <list-item>
                <p>Confirmation keywords—disease-finding SBI: infarct, infarcts, infarctions, infarction, lacune, lacunes</p>
              </list-item>
              <list-item>
                <p>Confirmation keywords—disease modifier SBI: acute, acute or subacute, recent, new, remote, old, chronic, prior, chronic foci of, benign, stable small, stable</p>
              </list-item>
              <list-item>
                <p>Confirmation keywords—disease location SBI: territorial, lacunar, cerebellar, cortical, frontal, caudate, right frontoparietal lobe, right frontal cortical, right frontal lobe, embolic, left basal ganglia lacunar, basal ganglia lacunar, left caudate and left putamen lacunar</p>
              </list-item>
              <list-item>
                <p>Confirmation keywords—disease-finding WMD: leukoaraiosis, white matter, microvascular ischemic, microvascular leukemic, microvascular degenerative</p>
              </list-item>
              <list-item>
                <p>Exclusion WMD: degenerative changes</p>
              </list-item>
            </list>
          </boxed-text>
        </sec>
        <sec>
          <title>Machine Learning</title>
          <p>The machine learning (ML) approach allows the system to automatically learn robust decision rules from labeled training data. The task was defined as a sequential sentence classification task. We adopted Kim’s convolutional neural network (CNN) [<xref ref-type="bibr" rid="ref21">21</xref>] and implemented it using TensorFlow 1.1.02 [<xref ref-type="bibr" rid="ref22">22</xref>]. The model architecture, shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>, is a variation of the CNN architecture of Collobert R [<xref ref-type="bibr" rid="ref23">23</xref>].</p>
          <p>We also adopted 3 traditional ML models—random forest [<xref ref-type="bibr" rid="ref24">24</xref>], support vector machine [<xref ref-type="bibr" rid="ref25">25</xref>] and logistic regression [<xref ref-type="bibr" rid="ref26">26</xref>]—for baseline comparison. All models used word vector as input representation, where each word from the input sentence is represented as the k-dimensional word vector. The word vector is generated from word embedding, a learned representation for text where words that have the same meaning have a similar representation. Suppose x<sub>1</sub>, x<sub>2</sub>, … , x<sub>n</sub> is the sequence of word representations in a sentence where</p>
          <disp-formula>x<sub>i</sub> = E<sub>xi</sub>, i = 1,2, …, n.</disp-formula>
          <p>Here, E<sub>xi</sub> is the word embedding representation for word x<sub>i</sub> with the dimensionality d. In our ML experiment, we used Wang’s word embedding trained from Mayo Clinic clinical notes where d=100 [<xref ref-type="bibr" rid="ref27">27</xref>]. The embedding model is the skip-gram of word2vec, an architecture proposed by Mikolov T [<xref ref-type="bibr" rid="ref28">28</xref>]. Let x<sub>i:i+k-1</sub> represent a window of size k in the sentence. Then the output sequence of the convolutional layer is</p>
          <disp-formula>con<sub>i</sub> = f(w<sub>k</sub> x<sub>i:i+k-1</sub> + b<sub>k</sub>),</disp-formula>
          <p>where f is a rectified linear unit function, w<sub>k</sub> and b<sub>k</sub> are the learning parameters. Max pooling was then performed to record the largest number from each feature map. By doing so, we obtained fixed length global features for the whole sentence, that is,</p>
          <disp-formula>m<sub>k</sub> = max<sub>1≤i≤n-k+1</sub>(con<sub>i</sub>).</disp-formula>
          <p>Then the features are fed into a fully connected layer with the output being the final feature vector O=wm<sub>k</sub> + b. Finally, a softmax function is utilized to make the final classification decision, that is,</p>
          <disp-formula>p(sbi│x,θ) = e^(O<sub>sbi</sub>)/(e^(O<sub>sbi</sub>)+e^(O<sub>other</sub>)),</disp-formula>
          <p>where θ is a vector of the hyper parameters of the model, such as w<sub>k</sub>, b<sub>k</sub>, w and b.</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Convolutional neural network architecture with 2 channels for an example sentence.</p>
            </caption>
            <graphic xlink:href="medinform_v7i2e12109_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
      </sec>
      <sec>
        <title>Evaluation Metric</title>
        <p>For evaluation of the quality of the annotated corpus, Cohen kappa was calculated to measure the IAA during all phases [<xref ref-type="bibr" rid="ref29">29</xref>]. As the primary objective of the study is case ascertainment, we calculated the IAA at the report level.</p>
        <p>A 2 x 2 confusion matrix was used to calculate performance score for model evaluation: positive predictive value (PPV), sensitivity, negative predictive value (NPV), specificity, and accuracy using manual annotation as the gold standard. The McNemar test was adopted to evaluate the performance difference between the rule-based and ML models [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. To have a better understanding of the potential variation between neuroimaging reports and neuroimages, we compared the model with the best performance (rule-based) with neuroimaging interpretation. A total of 12 CT images and 12 MRI images were stratified—randomly sampled from the test set. A total of 2 attending neurologists read all 24 images and assigned the SBI and WMD status. The cases with discrepancies were adjudicated by the neuroradiologist (PHL). The agreement was assessed using kappa and F-measure [<xref ref-type="bibr" rid="ref32">32</xref>].</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Interannotator Agreements Across Neuroimaging Reports</title>
        <p>Among the total 400 double-read reports, 5 reports were removed because of invalid scan types. The IAAs across Mayo and Tufts neuroimaging reports were 0.87 and 0.91. Overall, there is a high agreement between readers on both reports (<xref ref-type="table" rid="table1">Tables 1</xref> and <xref ref-type="table" rid="table2">2</xref>). Age-specific prevalence of SBI and WMD is provided in <xref ref-type="app" rid="app2">Multimedia Appendix 2</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Interreader agreement across 207 Mayo neuroimaging reports.</p>
          </caption>
          <table width="1000" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="230"/>
            <col width="150"/>
            <col width="140"/>
            <col width="150"/>
            <col width="140"/>
            <col width="100"/>
            <col width="90"/>
            <thead>
              <tr valign="top">
                <td>Interannotator agreement</td>
                <td colspan="2">Computed tomography (n=63)</td>
                <td colspan="2">Magnetic resonance imaging (n=144)</td>
                <td colspan="2">Total (n=207)</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>% agree</td>
                <td>kappa</td>
                <td>% agree</td>
                <td>kappa</td>
                <td>% agree</td>
                <td>kappa</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Silent brain infarction</td>
                <td>98.4</td>
                <td>0.92</td>
                <td>97.2</td>
                <td>0.83</td>
                <td>97.6</td>
                <td>0.87</td>
              </tr>
              <tr valign="top">
                <td>White matter disease</td>
                <td>100.0</td>
                <td>1.00</td>
                <td>98.6</td>
                <td>0.97</td>
                <td>99.0</td>
                <td>0.98</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Interreader agreement across 188 Tufts Medical Center neuroimaging reports.</p>
          </caption>
          <table width="1000" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="230"/>
            <col width="150"/>
            <col width="140"/>
            <col width="150"/>
            <col width="140"/>
            <col width="100"/>
            <col width="90"/>
            <thead>
              <tr valign="top">
                <td>Interannotator agreement</td>
                <td colspan="2">Computed tomography (n=80)</td>
                <td colspan="2">Magnetic resonance imaging (n=108)</td>
                <td colspan="2">Total (n=188)</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>% agree</td>
                <td>kappa</td>
                <td>% agree</td>
                <td>kappa</td>
                <td>% agree</td>
                <td>kappa</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Silent brain infarction</td>
                <td>98.8</td>
                <td>0.79</td>
                <td>99.1</td>
                <td>0.94</td>
                <td>99.5</td>
                <td>0.91</td>
              </tr>
              <tr valign="top">
                <td>White matter disease</td>
                <td>100.0</td>
                <td>1.00</td>
                <td>99.1</td>
                <td>0.98</td>
                <td>99.5</td>
                <td>0.99</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Natural Language Processing System Performance</title>
        <p>Overall, the rule-based system yielded the best performance of predicting SBI with an accuracy of 0.991. The CNN achieved the best score on predicting WMD (0.994). Full results are provided in <xref ref-type="table" rid="table3">Table 3</xref>.</p>
        
        <p>According to the McNemar test, the difference between the rule-based system and the CNN on SBI was statistically significant (<italic>P</italic>=.03). We found no statistically significant difference between the rest of the models.</p>
        <p><xref ref-type="table" rid="table4">Table 4</xref> lists the evaluation results of NLP and gold standard derived from reports against the neuroimaging interpretation for SBI and WMD. Both NLP and the gold standard had moderate-to-high agreement with the neuroimaging interpretation, with kappa scores around .5. Our further analysis showed that the practice-graded findings (gold standard and NLP) achieved high precision and moderate recall scores compared with the neuroimaging interpretation. After confirmation with Mayo and TMC radiologists, we believed this discrepancy was because of the inconsistency in documentation standards related to clinical incidental findings, causing SBIs and WMDs to be underreported.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Performance on test dataset against human annotation as gold standard.</p>
          </caption>
          <table width="1000" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="290"/>
            <col width="120"/>
            <col width="120"/>
            <col width="150"/>
            <col width="170"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Evaluation of natural language processing, model name</td>
                <td>Sensitivity</td>
                <td>Specificity</td>
                <td>Positive predictive value</td>
                <td>Negative predictive value</td>
                <td>Accuracy</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="7"><bold>Silent brain infarction (n=333)</bold></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Rule-based system</td>
                <td>0.925</td>
                <td>1.000</td>
                <td>1.000</td>
                <td>0.990</td>
                <td>0.991</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>CNN<sup>a</sup></td>
                <td>0.650</td>
                <td>0.993</td>
                <td>0.929</td>
                <td>0.954</td>
                <td>0.952</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Logistic regression</td>
                <td>0.775</td>
                <td>0.983</td>
                <td>0.861</td>
                <td>0.970</td>
                <td>0.958</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>SVM<sup>b</sup></td>
                <td>0.825</td>
                <td>1.000</td>
                <td>1.000</td>
                <td>0.977</td>
                <td>0.979</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Random forest</td>
                <td>0.875</td>
                <td>1.000</td>
                <td>1.000</td>
                <td>0.983</td>
                <td>0.986</td>
              </tr>
              <tr valign="top">
                <td colspan="7"><bold>White matter disease (n=333)</bold></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Rule-based system</td>
                <td>0.942</td>
                <td>0.909</td>
                <td>0.933</td>
                <td>0.921</td>
                <td>0.928</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>CNN</td>
                <td>0.994</td>
                <td>0.994</td>
                <td>0.994</td>
                <td>0.994</td>
                <td>0.994</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Logistic regression</td>
                <td>0.906</td>
                <td>0.865</td>
                <td>0.896</td>
                <td>0.877</td>
                <td>0.888</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>SVM</td>
                <td>0.864</td>
                <td>0.894</td>
                <td>0.917</td>
                <td>0.830</td>
                <td>0.877</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Random forest</td>
                <td>0.932</td>
                <td>0.880</td>
                <td>0.913</td>
                <td>0.906</td>
                <td>0.910</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>CNN: convolutional neural network.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>SVM: support vector machine.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Comparison of the neuroimaging interpretation with gold standard and natural language processing.</p>
          </caption>
          <table width="1000" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="430"/>
            <col width="140"/>
            <col width="140"/>
            <col width="130"/>
            <col width="130"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Evaluation of natural language processing against the neuroimaging interpretation</td>
                <td>F-measure</td>
                <td>kappa</td>
                <td>Precision</td>
                <td>Recall</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="6"><bold>Silent brain infarction (n=24)</bold></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Gold standard</td>
                <td>0.74</td>
                <td>0.50</td>
                <td>0.92</td>
                <td>0.69</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>NLP<sup>a</sup></td>
                <td>0.74</td>
                <td>0.50</td>
                <td>0.92</td>
                <td>0.69</td>
              </tr>
              <tr valign="top">
                <td colspan="6"><bold>White matter disease (n=24)</bold></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Gold standard</td>
                <td>0.78</td>
                <td>0.56</td>
                <td>0.86</td>
                <td>0.80</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>NLP</td>
                <td>0.74</td>
                <td>0.49</td>
                <td>0.85</td>
                <td>0.73</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>NLP: natural language processing.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Machine Learning Versus Rule</title>
        <p>In summary, the rule-based system achieved the best performance in predicting SBI, and the CNN model yielded the highest score in predicting WMD. When detecting SBI, the ML models were able to achieve high specificity, NPV, and PPV but moderate sensitivity because of the small number of positive cases. Oversampling is a technique to adjust the class distribution of training data to balance the ratio between positive and negative cases [<xref ref-type="bibr" rid="ref33">33</xref>]. This technique was applied to the training data to help boost the signals of positive SBIs. The performance was slightly improved but was limited by the issue of overfitting, a situation in which a model learns the training data too well. When this happens, unnecessary details and noise in the training data can negatively affect the generalizability of the model. In our case, the Mayo reports have larger language variation (noise) because of a free-style documentation method, whereas TMC uses a template-based documentation method. According to the sublanguage analysis, Mayo had 212 unique expressions for describing no acute infarction, whereas TMC had only 12. Therefore, the model trained on oversampled data had a bias toward the expressions that only appeared in the training set. When predicting WMD, the ML model outperformed the rule-based model. This is because the dataset for WMD is more balanced than that for SBI (60% positive cases), which allows the system to learn equally from both classes (positive and negative). The overall performance on WMD is better than on SBI because WMDs are often explicitly documented as important findings in the neuroimaging report.</p>
      </sec>
      <sec>
        <title>False Prediction Analysis</title>
        <p>Coreference resolution was the major challenge to the rule-based model for identifying SBIs. Coreference resolution is an NLP task to determine whether 2 mentioned concepts refer to the same real-world entity. For example, in <xref ref-type="boxed-text" rid="box2">Textbox 2</xref>, “The above findings” refers to “where there is an associated region of nonenhancing encephalomalacia and linear hemosiderin disposition.” To determine if a finding is SBI positive, the system needs to extract both concepts and detect their coreference relationship.</p>
        <boxed-text id="box2" position="float">
          <title>Example of coreference resolution.</title>
          <p>“Scattered, nonspecific T2 foci, most prominently in the left parietal white matter <italic>&lt;Concept 1&gt;where there is an associated region of nonenhancing encephalomalacia and linear hemosiderin disposition. &lt;Concept 1/&gt;</italic> Linear hemosiderin deposition overlying the right temporal lobe (series 9, image 16) as well. No abnormal enhancement today. <italic>&lt;Concept 2&gt;The above findings are nonspecific but the evolution, hemosiderin deposition, and gliosis suggest post ischemic change. &lt;Concept 2/&gt;</italic>”</p>
        </boxed-text>
        <p>For the ML system, the false positives from the identification of SBIs were commonly caused by disease locations. As the keywords <italic>foci, right occipital lobe, right parietal lobe, right subinsular region</italic>, and <italic>left frontal region</italic> often coexisted with SBI expressions, the model assigned higher weights to these concepts when the model was trained. For example, the expression: “there are a bilateral intraparenchymal foci of susceptibility artifact in the right occipital lobe, right parietal lobe, right subinsular region and left frontal region” has 4 locations with no mention of “infarction” appearing in the sentence. The ML system still predicted it as SBI positive. Among all ML models, the CNN yielded the worst NPV, which suggested the CNN was more likely to receive false signals from disease locations. Our next step is to further refine the system by increasing the size of the training set by leveraging distant supervision to obtain additional SBI positive cases.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Our study has several limitations. First, despite the high feasibility of detecting SBIs from neuroimaging reports, there is a variation between NLP-labeled neuroimaging reports and neuroimages. Second, the performances of the ML models are limited by the number of annotated datasets. Additional training data are required to have a comprehensive comparison between the rule-based and ML systems. Third, the systems were only evaluated using datasets from 2 sites; the generalizability of the systems may be limited.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>We adopted a standardized data abstraction and modeling process to develop NLP techniques (rule-based and ML) to detect incidental SBIs and WMDs from annotated neuroimaging reports. Validation statistics suggested a high feasibility of detecting SBIs and WMDs from EHRs using NLP.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <app id="app1">
        <title>Multimedia Appendix 1</title>
        <p>Gold standard development.</p>
        <media xlink:href="medinform_v7i2e12109_app1.pdf" xlink:title="PDF File (Adobe PDF File), 221KB"/>
      </app>
      <app id="app2">
        <title>Multimedia Appendix 2</title>
        <p>Supplementary result.</p>
        <media xlink:href="medinform_v7i2e12109_app2.pdf" xlink:title="PDF File (Adobe PDF File), 465KB"/>
      </app>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">CNN</term>
          <def>
            <p>convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CT</term>
          <def>
            <p>computed tomography</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">IAA</term>
          <def>
            <p>interannotator agreement</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">IE</term>
          <def>
            <p>information extraction</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">MRI</term>
          <def>
            <p>magnetic resonance imaging</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">NPV</term>
          <def>
            <p>negative predictive value</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">PPV</term>
          <def>
            <p>positive predictive value</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">SBI</term>
          <def>
            <p>silent brain infarction</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">TMC</term>
          <def>
            <p>Tufts Medical Center</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">WMD</term>
          <def>
            <p>white matter disease</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors would like to gratefully acknowledge National Institutes of Health grant 1R01NS102233 and Donna M Ihrke for case validation.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Fanning</surname>
            <given-names>JP</given-names>
          </name>
          <name name-style="western">
            <surname>Wesley</surname>
            <given-names>AJ</given-names>
          </name>
          <name name-style="western">
            <surname>Wong</surname>
            <given-names>AA</given-names>
          </name>
          <name name-style="western">
            <surname>Fraser</surname>
            <given-names>JF</given-names>
          </name>
        </person-group>
        <article-title>Emerging spectra of silent brain infarction</article-title>
        <source>Stroke</source>  
        <year>2014</year>  
        <month>11</month>  
        <volume>45</volume>  
        <issue>11</issue>  
        <fpage>3461</fpage>  
        <lpage>71</lpage>  
        <pub-id pub-id-type="doi">10.1161/STROKEAHA.114.005919</pub-id>
        <pub-id pub-id-type="medline">25293663</pub-id>
        <pub-id pub-id-type="pii">STROKEAHA.114.005919</pub-id></nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Fanning</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Wong</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Fraser</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>The epidemiology of silent brain infarction: a systematic review of population-based cohorts</article-title>
        <source>BMC Med</source>  
        <year>2014</year>  
        <month>07</month>  
        <day>9</day>  
        <volume>12</volume>  
        <fpage>119</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedicine.biomedcentral.com/articles/10.1186/s12916-014-0119-0"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1186/s12916-014-0119-0</pub-id>
        <pub-id pub-id-type="medline">25012298</pub-id>
        <pub-id pub-id-type="pii">s12916-014-0119-0</pub-id>
        <pub-id pub-id-type="pmcid">PMC4226994</pub-id></nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Vermeer</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Longstreth</surname>
            <given-names>JW</given-names>
          </name>
          <name name-style="western">
            <surname>Koudstaal</surname>
            <given-names>P</given-names>
          </name>
        </person-group>
        <article-title>Silent brain infarcts: a systematic review</article-title>
        <source>Lancet Neurol</source>  
        <year>2007</year>  
        <month>07</month>  
        <volume>6</volume>  
        <issue>7</issue>  
        <fpage>611</fpage>  
        <lpage>9</lpage>  
        <pub-id pub-id-type="doi">10.1016/S1474-4422(07)70170-9</pub-id>
        <pub-id pub-id-type="medline">17582361</pub-id>
        <pub-id pub-id-type="pii">S1474-4422(07)70170-9</pub-id></nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Furie</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Kasner</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Adams</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Albers</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Bush</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Fagan</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Halperin</surname>
            <given-names>JL</given-names>
          </name>
          <name name-style="western">
            <surname>Johnston</surname>
            <given-names>SC</given-names>
          </name>
          <name name-style="western">
            <surname>Katzan</surname>
            <given-names>I</given-names>
          </name>
          <name name-style="western">
            <surname>Kernan</surname>
            <given-names>WN</given-names>
          </name>
          <name name-style="western">
            <surname>Mitchell</surname>
            <given-names>PH</given-names>
          </name>
          <name name-style="western">
            <surname>Ovbiagele</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Palesch</surname>
            <given-names>YY</given-names>
          </name>
          <name name-style="western">
            <surname>Sacco</surname>
            <given-names>RL</given-names>
          </name>
          <name name-style="western">
            <surname>Schwamm</surname>
            <given-names>LH</given-names>
          </name>
          <name name-style="western">
            <surname>Wassertheil-Smoller</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Turan</surname>
            <given-names>TN</given-names>
          </name>
          <name name-style="western">
            <surname>Wentworth</surname>
            <given-names>D</given-names>
          </name>
          <collab>American Heart Association Stroke Council, Council on Cardiovascular Nursing, Council on Clinical Cardiology, and Interdisciplinary Council on Quality of Care and Outcomes Research</collab>
        </person-group>
        <article-title>Guidelines for the prevention of stroke in patients with stroke or transient ischemic attack: a guideline for healthcare professionals from the American Heart Association/American Stroke Association</article-title>
        <source>Stroke</source>  
        <year>2011</year>  
        <month>01</month>  
        <volume>42</volume>  
        <issue>1</issue>  
        <fpage>227</fpage>  
        <lpage>76</lpage>  
        <pub-id pub-id-type="doi">10.1161/STR.0b013e3181f7d043</pub-id>
        <pub-id pub-id-type="medline">20966421</pub-id>
        <pub-id pub-id-type="pii">STR.0b013e3181f7d043</pub-id></nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Gouw</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>van der Flier</surname>
            <given-names>WM</given-names>
          </name>
          <name name-style="western">
            <surname>Fazekas</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>van Straaten</surname>
            <given-names>EC</given-names>
          </name>
          <name name-style="western">
            <surname>Pantoni</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Poggesi</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Inzitari</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Erkinjuntti</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Wahlund</surname>
            <given-names>LO</given-names>
          </name>
          <name name-style="western">
            <surname>Waldemar</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Schmidt</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Scheltens</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Barkhof</surname>
            <given-names>F</given-names>
          </name>
          <collab>LADIS Study Group</collab>
        </person-group>
        <article-title>Progression of white matter hyperintensities and incidence of new lacunes over a 3-year period: the Leukoaraiosis and Disability study</article-title>
        <source>Stroke</source>  
        <year>2008</year>  
        <month>05</month>  
        <volume>39</volume>  
        <issue>5</issue>  
        <fpage>1414</fpage>  
        <lpage>20</lpage>  
        <pub-id pub-id-type="doi">10.1161/STROKEAHA.107.498535</pub-id>
        <pub-id pub-id-type="medline">18323505</pub-id>
        <pub-id pub-id-type="pii">STROKEAHA.107.498535</pub-id></nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hijdra</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Verbeeten</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Verhulst</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Relation of leukoaraiosis to lesion type in stroke patients</article-title>
        <source>Stroke</source>  
        <year>1990</year>  
        <month>06</month>  
        <volume>21</volume>  
        <issue>6</issue>  
        <fpage>890</fpage>  
        <lpage>4</lpage>  
        <pub-id pub-id-type="doi">10.1161/01.STR.21.6.890</pub-id>
        <pub-id pub-id-type="medline">2349592</pub-id></nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Miyao</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Takano</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Teramoto</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Takahashi</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>Leukoaraiosis in relation to prognosis for patients with lacunar infarction</article-title>
        <source>Stroke</source>  
        <year>1992</year>  
        <month>10</month>  
        <volume>23</volume>  
        <issue>10</issue>  
        <fpage>1434</fpage>  
        <lpage>8</lpage>  
        <pub-id pub-id-type="doi">10.1161/01.STR.23.10.1434</pub-id>
        <pub-id pub-id-type="medline">1412580</pub-id></nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Fu</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Lu</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Hong</surname>
            <given-names>Z</given-names>
          </name>
          <name name-style="western">
            <surname>Dong</surname>
            <given-names>Q</given-names>
          </name>
          <name name-style="western">
            <surname>Luo</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Wong</surname>
            <given-names>K</given-names>
          </name>
        </person-group>
        <article-title>Extent of white matter lesions is related to acute subcortical infarcts and predicts further stroke risk in patients with first ever ischaemic stroke</article-title>
        <source>J Neurol Neurosurg Psychiatry</source>  
        <year>2005</year>  
        <month>06</month>  
        <volume>76</volume>  
        <issue>6</issue>  
        <fpage>793</fpage>  
        <lpage>6</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://jnnp.bmj.com/cgi/pmidlookup?view=long&amp;pmid=15897500"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1136/jnnp.2003.032771</pub-id>
        <pub-id pub-id-type="medline">15897500</pub-id>
        <pub-id pub-id-type="pii">76/6/793</pub-id>
        <pub-id pub-id-type="pmcid">PMC1739660</pub-id></nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Tang</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Wei</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Li</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Z</given-names>
          </name>
        </person-group>
        <article-title>Association of white matter integrity and cognitive functions in patients with subcortical silent lacunar infarcts</article-title>
        <source>Stroke</source>  
        <year>2015</year>  
        <month>04</month>  
        <volume>46</volume>  
        <issue>4</issue>  
        <fpage>1123</fpage>  
        <lpage>6</lpage>  
        <pub-id pub-id-type="doi">10.1161/STROKEAHA.115.008998</pub-id>
        <pub-id pub-id-type="medline">25737316</pub-id>
        <pub-id pub-id-type="pii">STROKEAHA.115.008998</pub-id></nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Conklin</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Silver</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Mikulis</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Mandell</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <article-title>Are acute infarcts the cause of leukoaraiosis? Brain mapping for 16 consecutive weeks</article-title>
        <source>Ann Neurol</source>  
        <year>2014</year>  
        <month>12</month>  
        <volume>76</volume>  
        <issue>6</issue>  
        <fpage>899</fpage>  
        <lpage>904</lpage>  
        <pub-id pub-id-type="doi">10.1002/ana.24285</pub-id>
        <pub-id pub-id-type="medline">25283088</pub-id></nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Xu</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Jiang</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Oetjens</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Bowton</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Ramirez</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Jeff</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Basford</surname>
            <given-names>MA</given-names>
          </name>
          <name name-style="western">
            <surname>Pulley</surname>
            <given-names>JM</given-names>
          </name>
          <name name-style="western">
            <surname>Cowan</surname>
            <given-names>JD</given-names>
          </name>
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>X</given-names>
          </name>
          <name name-style="western">
            <surname>Ritchie</surname>
            <given-names>MD</given-names>
          </name>
          <name name-style="western">
            <surname>Masys</surname>
            <given-names>DR</given-names>
          </name>
          <name name-style="western">
            <surname>Roden</surname>
            <given-names>DM</given-names>
          </name>
          <name name-style="western">
            <surname>Crawford</surname>
            <given-names>DC</given-names>
          </name>
          <name name-style="western">
            <surname>Denny</surname>
            <given-names>JC</given-names>
          </name>
        </person-group>
        <article-title>Facilitating pharmacogenetic studies using electronic health records and natural-language processing: a case study of warfarin</article-title>
        <source>J Am Med Inform Assoc</source>  
        <year>2011</year>  
        <volume>18</volume>  
        <issue>4</issue>  
        <fpage>387</fpage>  
        <lpage>91</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/21672908"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1136/amiajnl-2011-000208</pub-id>
        <pub-id pub-id-type="medline">21672908</pub-id>
        <pub-id pub-id-type="pii">amiajnl-2011-000208</pub-id>
        <pub-id pub-id-type="pmcid">PMC3128409</pub-id></nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Grishman</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Huttunen</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Yangarber</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>Information extraction for enhanced access to disease outbreak reports</article-title>
        <source>J Biomed Inform</source>  
        <year>2002</year>  
        <month>08</month>  
        <volume>35</volume>  
        <issue>4</issue>  
        <fpage>236</fpage>  
        <lpage>46</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(03)00013-3"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1016/S1532-0464(03)00013-3</pub-id>
        <pub-id pub-id-type="medline">12755518</pub-id>
        <pub-id pub-id-type="pii">S1532-0464(03)00013-3</pub-id></nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>South</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Shen</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Jones</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Garvin</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Samore</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Chapman</surname>
            <given-names>W</given-names>
          </name>
          <name name-style="western">
            <surname>Gundlapalli</surname>
            <given-names>AV</given-names>
          </name>
        </person-group>
        <article-title>Developing a manually annotated clinical document corpus to identify phenotypic information for inflammatory bowel disease</article-title>
        <source>Summit Transl Bioinform</source>  
        <year>2009</year>  
        <month>03</month>  
        <day>1</day>  
        <volume>2009</volume>  
        <fpage>1</fpage>  
        <lpage>32</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/21347157"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1186/1471-2105-10-S9-S12</pub-id>
        <pub-id pub-id-type="medline">21347157</pub-id>
        <pub-id pub-id-type="pmcid">PMC3041557</pub-id></nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Gilbert</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Lowenstein</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Koziol-McLain</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Barta</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Steiner</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Chart reviews in emergency medicine research: where are the methods?</article-title>
        <source>Ann Emerg Med</source>  
        <year>1996</year>  
        <month>03</month>  
        <volume>27</volume>  
        <issue>3</issue>  
        <fpage>305</fpage>  
        <lpage>8</lpage>  
        <pub-id pub-id-type="doi">10.1016/S0196-0644(96)70264-0</pub-id>
        <pub-id pub-id-type="medline">8599488</pub-id>
        <pub-id pub-id-type="pii">S0196-0644(96)70264-0</pub-id></nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Rastegar-Mojarad</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Moon</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Shen</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Afzal</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Zeng</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Mehrabi</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Sohn</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>H</given-names>
          </name>
        </person-group>
        <article-title>Clinical information extraction applications: a literature review</article-title>
        <source>J Biomed Inform</source>  
        <year>2018</year>  
        <month>01</month>  
        <volume>77</volume>  
        <fpage>34</fpage>  
        <lpage>49</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(17)30256-3"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1016/j.jbi.2017.11.011</pub-id>
        <pub-id pub-id-type="medline">29162496</pub-id>
        <pub-id pub-id-type="pii">S1532-0464(17)30256-3</pub-id>
        <pub-id pub-id-type="pmcid">PMC5771858</pub-id></nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Murff</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>FitzHenry</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Matheny</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Gentry</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Kotter</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Crimin</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Dittus</surname>
            <given-names>RS</given-names>
          </name>
          <name name-style="western">
            <surname>Rosen</surname>
            <given-names>AK</given-names>
          </name>
          <name name-style="western">
            <surname>Elkin</surname>
            <given-names>PL</given-names>
          </name>
          <name name-style="western">
            <surname>Brown</surname>
            <given-names>SH</given-names>
          </name>
          <name name-style="western">
            <surname>Speroff</surname>
            <given-names>T</given-names>
          </name>
        </person-group>
        <article-title>Automated identification of postoperative complications within an electronic medical record using natural language processing</article-title>
        <source>J Am Med Assoc</source>  
        <year>2011</year>  
        <month>08</month>  
        <day>24</day>  
        <volume>306</volume>  
        <issue>8</issue>  
        <fpage>848</fpage>  
        <lpage>55</lpage>  
        <pub-id pub-id-type="doi">10.1001/jama.2011.1204</pub-id>
        <pub-id pub-id-type="medline">21862746</pub-id>
        <pub-id pub-id-type="pii">306/8/848</pub-id></nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Denny</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Ritchie</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Basford</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Pulley</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Bastarache</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Brown-Gentry</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Masys</surname>
            <given-names>DR</given-names>
          </name>
          <name name-style="western">
            <surname>Roden</surname>
            <given-names>DM</given-names>
          </name>
          <name name-style="western">
            <surname>Crawford</surname>
            <given-names>DC</given-names>
          </name>
        </person-group>
        <article-title>PheWAS: demonstrating the feasibility of a phenome-wide scan to discover gene-disease associations</article-title>
        <source>Bioinformatics</source>  
        <year>2010</year>  
        <month>05</month>  
        <day>1</day>  
        <volume>26</volume>  
        <issue>9</issue>  
        <fpage>1205</fpage>  
        <lpage>10</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/20335276"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1093/bioinformatics/btq126</pub-id>
        <pub-id pub-id-type="medline">20335276</pub-id>
        <pub-id pub-id-type="pii">btq126</pub-id>
        <pub-id pub-id-type="pmcid">PMC2859132</pub-id></nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ferrucci</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Lally</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>UIMA: an architectural approach to unstructured information processing in the corporate research environment</article-title>
        <source>Nat Lang Eng</source>  
        <year>2004</year>  
        <volume>10</volume>  
        <issue>3-4</issue>  
        <fpage>327</fpage>  
        <lpage>48</lpage>  
        <pub-id pub-id-type="doi">10.1017/S1351324904003523</pub-id></nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>McCarty</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Chisholm</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Chute</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Kullo</surname>
            <given-names>I</given-names>
          </name>
          <name name-style="western">
            <surname>Jarvik</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Larson</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Li</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Masys</surname>
            <given-names>DR</given-names>
          </name>
          <name name-style="western">
            <surname>Ritchie</surname>
            <given-names>MD</given-names>
          </name>
          <name name-style="western">
            <surname>Roden</surname>
            <given-names>DM</given-names>
          </name>
          <name name-style="western">
            <surname>Struewing</surname>
            <given-names>JP</given-names>
          </name>
          <name name-style="western">
            <surname>Wolf</surname>
            <given-names>WA</given-names>
          </name>
          <collab>eMERGE Team</collab>
        </person-group>
        <article-title>The eMERGE Network: a consortium of biorepositories linked to electronic medical records data for conducting genomic studies</article-title>
        <source>BMC Med Genomics</source>  
        <year>2011</year>  
        <month>01</month>  
        <day>26</day>  
        <volume>4</volume>  
        <fpage>13</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedgenomics.biomedcentral.com/articles/10.1186/1755-8794-4-13"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1186/1755-8794-4-13</pub-id>
        <pub-id pub-id-type="medline">21269473</pub-id>
        <pub-id pub-id-type="pii">1755-8794-4-13</pub-id>
        <pub-id pub-id-type="pmcid">PMC3038887</pub-id></nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Church</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Hanks</surname>
            <given-names>P</given-names>
          </name>
        </person-group>
        <article-title>Word association norms, mutual information, and lexicography</article-title>
        <source>Comput Linguist</source>  
        <year>1990</year>  
        <volume>16</volume>  
        <issue>1</issue>  
        <fpage>22</fpage>  
        <lpage>9</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/J90-1003"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Kim</surname>
            <given-names>Y</given-names>
          </name>
        </person-group>
        <source>Association for Computational Linguistics</source>  
        <year>2014</year>  
        <access-date>2019-04-08</access-date>
        <comment>Convolutional neural networks for sentence classification 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/D14-1181.pdf">https://www.aclweb.org/anthology/D14-1181.pdf</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="77TSHIa8O"/></comment> </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Abadi</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Barham</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Z</given-names>
          </name>
          <name name-style="western">
            <surname>Davis</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Dean</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Devin</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Ghemawat</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Irving</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Isard</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Kudlur</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Levenberg</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Monga</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Moore</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Murray</surname>
            <given-names>DG</given-names>
          </name>
          <name name-style="western">
            <surname>Steiner</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Tucker</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Vasudevan</surname>
            <given-names>V</given-names>
          </name>
          <name name-style="western">
            <surname>Warden</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Wicke</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Yu</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Zheng</surname>
            <given-names>X</given-names>
          </name>
        </person-group>
        <article-title>TensorFlow: a system for large-scale machine learning</article-title>
        <source>Proceedings of the 12th USENIX conference on Operating Systems Design and Implementation</source>  
        <year>2016</year>  
        <conf-name>OSDI'16</conf-name>
        <conf-date>November 2–4, 2016</conf-date>
        <conf-loc>Savannah, GA, USA</conf-loc></nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Collobert</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Weston</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>A unified architecture for natural language processing: deep neural networks with multitask learning</article-title>
        <source>Proceedings of the 25th international conference on Machine learning</source>  
        <year>2008</year>  
        <conf-name>ICML'08</conf-name>
        <conf-date>July 5-9, 2008</conf-date>
        <conf-loc>Helsinki, Finland</conf-loc>
        <publisher-loc>New York, NY, USA</publisher-loc>
        <publisher-name>ACM</publisher-name>
        <pub-id pub-id-type="doi">10.1145/1390156.1390177</pub-id></nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Breiman</surname>
            <given-names>L</given-names>
          </name>
        </person-group>
        <article-title>Random forests</article-title>
        <source>Mach Learn</source>  
        <year>2001</year>  
        <volume>45</volume>  
        <issue>1</issue>  
        <fpage>5</fpage>  
        <lpage>32</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id></nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Cortes</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Vapnik</surname>
            <given-names>V</given-names>
          </name>
        </person-group>
        <article-title>Support-vector networks</article-title>
        <source>Mach Learn</source>  
        <year>1995</year>  
        <volume>20</volume>  
        <issue>3</issue>  
        <fpage>273</fpage>  
        <lpage>97</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://image.diku.dk/imagecanon/material/cortes_vapnik95.pdf"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1023/A:1022627411411</pub-id></nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hosmer</surname>
            <given-names>DW</given-names>
            <suffix>Jr</suffix>
          </name>
          <name name-style="western">
            <surname>Lemeshow</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Sturdivant</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <source>Applied Logistic Regression</source>  
        <year>2013</year>  
        <publisher-loc>Hoboken, New Jersey</publisher-loc>
        <publisher-name>John Wiley &amp; Sons</publisher-name></nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Afzal</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Rastegar-Mojarad</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Shen</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Kingsbury</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>H</given-names>
          </name>
        </person-group>
        <article-title>A comparison of word embeddings for the biomedical natural language processing</article-title>
        <source>J Biomed Inform</source>  
        <year>2018</year>  
        <month>11</month>  
        <volume>87</volume>  
        <fpage>12</fpage>  
        <lpage>20</lpage>  
        <pub-id pub-id-type="doi">10.1016/j.jbi.2018.09.008</pub-id>
        <pub-id pub-id-type="medline">30217670</pub-id>
        <pub-id pub-id-type="pii">S1532-0464(18)30182-5</pub-id></nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Mikolov</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Corrado</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Dean</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <source>arXiv</source>  
        <year>2013</year>  
        <access-date>2019-04-08</access-date>
        <comment>Efficient estimation of word representations in vector space 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1301.3781.pdf">https://arxiv.org/pdf/1301.3781.pdf</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="77TWweBoa"/></comment> </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Cohen</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>A coefficient of agreement for nominal scales</article-title>
        <source>Educ Psychol Meas</source>  
        <year>1960</year>
        <month>04</month>
        <volume>20</volume>  
        <issue>1</issue>  
        <fpage>37</fpage>  
        <lpage>46</lpage>  
        <pub-id pub-id-type="doi">10.1177/001316446002000104</pub-id></nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>McNemar</surname>
            <given-names>Q</given-names>
          </name>
        </person-group>
        <article-title>Note on the sampling error of the difference between correlated proportions or percentages</article-title>
        <source>Psychometrika</source>  
        <year>1947</year>  
        <month>06</month>  
        <volume>12</volume>  
        <issue>2</issue>  
        <fpage>153</fpage>  
        <lpage>157</lpage>
        <pub-id pub-id-type="doi">10.1007/BF02295996</pub-id>
        <pub-id pub-id-type="medline">20254758</pub-id></nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Dietterich</surname>
            <given-names>T</given-names>
          </name>
        </person-group>
        <article-title>Approximate statistical tests for comparing supervised classification learning algorithms</article-title>
        <source>Neural Comput</source>  
        <year>1998</year>  
        <volume>10</volume>  
        <issue>7</issue>  
        <fpage>1895</fpage>  
        <lpage>1923</lpage>
        <pub-id pub-id-type="doi">10.1162/089976698300017197</pub-id></nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Sasaki</surname>
            <given-names>Y</given-names>
          </name>
        </person-group>
        <source>Old Dominion University</source>  
        <year>2007</year>  
        <access-date>2019-04-08</access-date>
        <comment>The truth of the F-measure 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.researchgate.net/publication/268185911_The_truth_of_the_F-measure">https://www.researchgate.net/publication/268185911_The_truth_of_the_F-measure</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="77TXSTW3V"/></comment> </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Chawla</surname>
            <given-names>NV</given-names>
          </name>
          <name name-style="western">
            <surname>Bowyer</surname>
            <given-names>KW</given-names>
          </name>
          <name name-style="western">
            <surname>Hall</surname>
            <given-names>LO</given-names>
          </name>
          <name name-style="western">
            <surname>Kegelmeyer</surname>
            <given-names>WP</given-names>
          </name>
        </person-group>
        <source>arXiv</source>  
        <year>2002</year>  
        <access-date>2019-04-08</access-date>
        <comment>SMOTE: synthetic minority over-sampling technique 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1106.1813.pdf">https://arxiv.org/pdf/1106.1813.pdf</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="77TXcYm3e"/></comment> </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
