<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v7i4e15980</article-id>
      <article-id pub-id-type="pmid">31674914</article-id>
      <article-id pub-id-type="doi">10.2196/15980</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Cohort Selection for Clinical Trials From Longitudinal Patient Records: Text Mining Approach</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Karystianis</surname>
            <given-names>George</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Dai</surname>
            <given-names>Hong-Jie</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Basu</surname>
            <given-names>Tanmay</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zheng</surname>
            <given-names>Chengyi</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Pollettini</surname>
            <given-names>Juliana T</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Wang</surname>
            <given-names>Yanshan</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Spasic</surname>
            <given-names>Irena</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>School of Computer Science &#38; Informatics</institution>
            <institution>Cardiff University</institution>
            <addr-line>5 The Parade</addr-line>
            <addr-line>Cardiff, CF24 3AA</addr-line>
            <country>United Kingdom</country>
            <phone>44 02920870320</phone>
            <email>spasici@cardiff.ac.uk</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8132-3885</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Krzeminski</surname>
            <given-names>Dominik</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4568-0583</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Corcoran</surname>
            <given-names>Padraig</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9731-3385</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Balinsky</surname>
            <given-names>Alexander</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8151-4462</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>School of Computer Science &#38; Informatics</institution>
        <institution>Cardiff University</institution>
        <addr-line>Cardiff</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>School of Psychology</institution>
        <institution>Cardiff University</institution>
        <addr-line>Cardiff</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>School of Mathematics</institution>
        <institution>Cardiff University</institution>
        <addr-line>Cardiff</addr-line>
        <country>United Kingdom</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Irena Spasic <email>spasici@cardiff.ac.uk</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <season>Oct-Dec</season>
        <year>2019</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>31</day>
        <month>10</month>
        <year>2019</year>
      </pub-date>
      <volume>7</volume>
      <issue>4</issue>
      <elocation-id>e15980</elocation-id>
      <history>
        <date date-type="received">
          <day>28</day>
          <month>8</month>
          <year>2019</year>
        </date>
        <date date-type="rev-request">
          <day>19</day>
          <month>9</month>
          <year>2019</year>
        </date>
        <date date-type="rev-recd">
          <day>29</day>
          <month>9</month>
          <year>2019</year>
        </date>
        <date date-type="accepted">
          <day>2</day>
          <month>10</month>
          <year>2019</year>
        </date>
      </history>
      <copyright-statement>©Irena Spasic, Dominik Krzeminski, Padraig Corcoran, Alexander Balinsky. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 31.10.2019.</copyright-statement>
      <copyright-year>2019</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://medinform.jmir.org/2019/4/e15980/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Clinical trials are an important step in introducing new interventions into clinical practice by generating data on their safety and efficacy. Clinical trials need to ensure that participants are similar so that the findings can be attributed to the interventions studied and not to some other factors. Therefore, each clinical trial defines eligibility criteria, which describe characteristics that must be shared by the participants. Unfortunately, the complexities of eligibility criteria may not allow them to be translated directly into readily executable database queries. Instead, they may require careful analysis of the narrative sections of medical records. Manual screening of medical records is time consuming, thus negatively affecting the timeliness of the recruitment process.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>Track 1 of the 2018 National Natural Language Processing Clinical Challenge focused on the task of cohort selection for clinical trials, aiming to answer the following question: Can natural language processing be applied to narrative medical records to identify patients who meet eligibility criteria for clinical trials? The task required the participating systems to analyze longitudinal patient records to determine if the corresponding patients met the given eligibility criteria. We aimed to describe a system developed to address this task.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Our system consisted of 13 classifiers, one for each eligibility criterion. All classifiers used a bag-of-words document representation model. To prevent the loss of relevant contextual information associated with such representation, a pattern-matching approach was used to extract context-sensitive features. They were embedded back into the text as lexically distinguishable tokens, which were consequently featured in the bag-of-words representation. Supervised machine learning was chosen wherever a sufficient number of both positive and negative instances was available to learn from. A rule-based approach focusing on a small set of relevant features was chosen for the remaining criteria.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The system was evaluated using microaveraged F measure. Overall, 4 machine learning algorithms, including support vector machine, logistic regression, naïve Bayesian classifier, and gradient tree boosting (GTB), were evaluated on the training data using 10-fold cross-validation. Overall, GTB demonstrated the most consistent performance. Its performance peaked when oversampling was used to balance the training data. The final evaluation was performed on previously unseen test data. On average, the F measure of 89.04% was comparable to 3 of the top ranked performances in the shared task (91.11%, 90.28%, and 90.21%). With an F measure of 88.14%, we significantly outperformed these systems (81.03%, 78.50%, and 70.81%) in identifying patients with advanced coronary artery disease.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The holdout evaluation provides evidence that our system was able to identify eligible patients for the given clinical trial with high accuracy. Our approach demonstrates how rule-based knowledge infusion can improve the performance of machine learning algorithms even when trained on a relatively small dataset.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>machine learning</kwd>
        <kwd>electronic medical records</kwd>
        <kwd>clinical trial</kwd>
        <kwd>eligibility determination</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Clinical trials are medical research studies focusing on a specific health intervention. They involve human participants to generate data on safety and efficacy as any new health intervention needs to comply with the Hippocratic Oath: “First, do no harm!” With this principle in mind, clinical trials leading up to regulatory approval are typically divided into 3 phases, each involving a significantly higher number of patients (see <xref rid="figure1" ref-type="fig">Figure 1</xref>). Phase I aims to answer the following question: Is the intervention safe? The first few healthy participants are given very low doses of the treatment and are monitored closely. If there are no major side effects, the dose is iteratively increased until an effective dose is reached whose possible side effects are deemed acceptable. Phase II involves patients to determine whether the new intervention works or not. In other words, it assesses its efficacy while continually monitoring the side effects. Finally, in addition to safety and efficacy, phase III also tests the efficiency of the intervention by comparing it with other available interventions. When introducing control groups of participants, clinical trials need to ensure that they are as similar as possible to be able to attribute any findings to the interventions studied and not some other factors. Therefore, each clinical trial defines the eligibility criteria that describe characteristics that must be shared by all participants.</p>
        <p>Patient recruitment is universally recognized as a key determinant of success for clinical trials, yet they commonly fail to reach their recruitment goals [<xref ref-type="bibr" rid="ref1">1</xref>]. Almost a fifth of trials were terminated because they failed to recruit enough participants [<xref ref-type="bibr" rid="ref2">2</xref>], with less than one-fifth managing to reach their recruitment targets within the proposed time frames [<xref ref-type="bibr" rid="ref3">3</xref>]. Eligibility represents a major clinical domain barrier to participation [<xref ref-type="bibr" rid="ref4">4</xref>]. The eligibility criteria are often criticized for being too narrow, thus having a negative impact on recruitment rates and also the generalizability of findings. A stakeholder survey of the barriers to patient recruitment and possible solutions revealed identification of eligible patients using medical records and hospital-based registries and other databases as the key method to improve recruitment [<xref ref-type="bibr" rid="ref5">5</xref>]. Unfortunately, the complexities of the eligibility criteria do not allow them to be translated directly into readily executable database queries. Instead, they require careful analysis of information contained in the narrative sections of medical records. Manual screening of medical records is time consuming, thus negatively affecting the timeliness of the recruitment process. Text mining has a potential to provide a technical means for unclogging this bottleneck.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>The three premarketing phases of a clinical trial.</p>
          </caption>
          <graphic xlink:href="medinform_v7i4e15980_fig1.png" alt-version="no" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Related Work</title>
        <p>The problem of matching the eligibility criteria against patients’ electronic medical records (EMRs) can be framed using a variety of natural language processing (NLP) tasks depending on the type and level of automation expected. In the context of decision making, automation can be applied to 4 classes of functions: information acquisition, information analysis, decision selection, and decision implementation [<xref ref-type="bibr" rid="ref6">6</xref>]. In our scenario, we focused on a clinician as a human operator who, given a collection of EMRs and a set of eligibility criteria, needs to decide which patients should be recruited to a given clinical trial. In this context, we can think of information acquisition as identification of information relevant to the eligibility criteria. This task can be automated by means of information retrieval (IR) or information extraction (IE).</p>
        <p>IR can be applied to both structured and unstructured components of the EMRs to retrieve relevant records or their parts. The usability of any IR system depends on two key factors: system effectiveness and user utility [<xref ref-type="bibr" rid="ref7">7</xref>]. A test collection of 56 topics based on patient statements (eg, signs, symptoms, and treatment) and inclusion/exclusion criteria (eg, patient’s demographics, laboratory test, and diagnosis) can be used to evaluate the effectiveness of IR for cohort selection [<xref ref-type="bibr" rid="ref8">8</xref>]. The utility of IR systems can be improved by designing an intuitive visual query interface easily used by clinical researchers [<xref ref-type="bibr" rid="ref9">9</xref>]. Both utility and effectiveness depend on how well the system incorporates domain-specific knowledge. An ontology can be used to support term disambiguation, term normalization, and subsumption reasoning. Most studies mapped textual elements to concepts in the Unified Medical Language System (UMLS) for normalization with few studies discussing the use of semantic Web technologies for phenotyping [<xref ref-type="bibr" rid="ref10">10</xref>]. For instance, the UMLS hierarchy can be used to expand a query searching for cancer to other related terms (eg, neuroblastoma and glioma). However, using such a broad hierarchy for unsupervised expansion can introduce many irrelevant terms, which can be detrimental to eligibility-screening performance [<xref ref-type="bibr" rid="ref11">11</xref>]. This problem can be reduced by using the UMLS to bootstrap creation of custom ontologies relevant to the problem at hand. 
For example, to identify patients with cerebral aneurysms, a domain-specific ontology was created by querying the UMLS for concepts related to the locations of aneurysms (eg, middle cerebral artery or anterior communicating artery), other clinical phenotypes related to cerebral aneurysms (eg, saccular aneurysm or subarachnoid hemorrhage), associated conditions (eg, polycystic kidney disease), and competing diagnoses (eg, arteriovenous malformation) [<xref ref-type="bibr" rid="ref12">12</xref>]. Where available, other relevant systems can be used to inform the development of domain-specific ontologies. For instance, the Epilepsy Data Extraction and Annotation uses a novel Epilepsy and Seizure Ontology, which is based on the International League Against Epilepsy classification system as the core knowledge resource [<xref ref-type="bibr" rid="ref9">9</xref>].</p>
        <p>The complexity of clinical sublanguage may require new language modeling approaches to be able to formulate multilayered queries and customize the level of linguistic granularity [<xref ref-type="bibr" rid="ref13">13</xref>]. This approach to IR incorporates the output of other NLP systems to represent a document or a query using multiple aligned layers consisting of tokens, their part of speech, named entities with mappings to external knowledge sources, and syntactic dependencies among these elements. Other IR efforts focused on directing a clinician’s attention toward specific sentences that are relevant for eligibility determination [<xref ref-type="bibr" rid="ref14">14</xref>]. This is achieved by segmenting the natural language description of eligibility criteria into individual sentences, analyzing them further to identify domain-specific concepts, and using them to identify sentences in the EMRs that make references to these concepts. This approach is designed to work with categorical data but falls short when numerical data need to be interpreted. For instance, 5 numerical values are needed to diagnose a metabolic syndrome [<xref ref-type="bibr" rid="ref15">15</xref>]. Of these values, 3 (triglycerides, high-density lipoprotein cholesterol, and elevated fasting glucose) are stored in the laboratory information system, and as structured data are readily available for querying and comparison with referent values. However, in some systems, 2 values may be hidden in the narrative notes (elevated waist circumference and elevated blood pressure). Traditionally, IR approaches are based on the bag-of-words (BoW) model, which represents each document as an unordered collection of features that correspond to the words in a vocabulary for a given document collection. Therefore, by design, IR approaches will be ineffective when it comes to dealing with continuous variables. 
Conversely, IE based on simple regular expressions can be used to extract numerical values from text and make them amenable for further analysis and interpretation [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref18">18</xref>].</p>
        <p>However, the technical feasibility of the IE process does not mean that all relevant attributes are necessarily documented in a single source as the previous example illustrates. For example, a study on case-finding algorithms for hepatocellular cancer discovered significant differences in performance between 2 types of documents (pathology and radiology reports) [<xref ref-type="bibr" rid="ref19">19</xref>]. It also revealed a significant difference between the narrative reports and coded fields. This raises an important aspect of the completeness of information recorded in an EMR [<xref ref-type="bibr" rid="ref15">15</xref>]. It has been established that case finding by the International Classification of Diseases, Ninth Revision (ICD-9) coding alone is not sufficient to reliably identify patients with a particular disease or risk factors [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref22">22</xref>]. A few studies contrasted the utility of structured and unstructured information, with the NLP approaches usually demonstrating better results [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref23">23</xref>-<xref ref-type="bibr" rid="ref28">28</xref>]. In particular, the use of ICD-9 codes for patient phenotyping demonstrated markedly lower precision (or positive predictive value) [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. This finding is compatible with a hypothesis that ICD-9 codes are designed for billing purposes and as such may not capture the nuances of phenotypic characteristics in terms of information completeness, expressiveness, and granularity [<xref ref-type="bibr" rid="ref23">23</xref>].</p>
        <p>The analysis of the strengths and weaknesses of both data sources together with practical experiments has led to a consensus that clinical narratives should be used in combination with structured data for eligibility screening [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]. Therefore, data fusion is a key component of the information acquisition step in eligibility screening. It should by no means be limited to these 2 modalities of data. For example, clinical electroencephalography (EEG) is the most important investigation in the diagnosis and management of epilepsies. A multimodal patient cohort retrieval system has been designed to leverage the heterogeneous nature of EEG data by integrating EEG reports with EEG signal data [<xref ref-type="bibr" rid="ref29">29</xref>]. Though evidently important, data fusion techniques are beyond the scope of this study. Here, we focused exclusively on reviewing the methods used to mine clinical narratives for the purpose of eligibility screening. However, the awareness of the need for data fusion can help the reader realize the existence of an externally imposed upper bound on expected performance of text mining approaches.</p>
        <p>We have thus far discussed the role of IR and IE in the context of information acquisition. The clinician is still expected to review the retrieved information to decide who satisfies the eligibility criteria. Text mining can be used to support this process by automating information analysis and decision selection by means of feature extraction and text classification, respectively. Two NLP systems tailored to the clinical domain are most often used to extract rich linguistic and semantic features from the narrative found in EMRs: Medical Language Extraction and Encoding (MedLEE [<xref ref-type="bibr" rid="ref30">30</xref>]) [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref25">25</xref>] and clinical Text Analysis and Knowledge Extraction System (cTAKES [<xref ref-type="bibr" rid="ref31">31</xref>]) [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. They model the semantics by mapping text to the UMLS or a custom dictionary if required. Clinical text analysis needs to make fine-grained semantic distinctions as medical concepts may be negated, may describe someone other than the patient, and may be referring to time other than the present [<xref ref-type="bibr" rid="ref13">13</xref>]. MedLEE and cTAKES can not only identify concepts of interest but can also interpret their meaning in the context of negation, hedging, and specific sections. Both systems can also perform syntactic analysis to extract linguistic features such as part of speech and syntactic dependencies. Abbreviations are some of the most prominent features of clinical narratives. 
Unfortunately, both MedLEE and cTAKES demonstrated suboptimal performance in abbreviation recognition [<xref ref-type="bibr" rid="ref34">34</xref>], which may require development of bespoke solutions [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref35">35</xref>].</p>
        <p>Once the pertinent features have been extracted, they can be exploited by rule-based or machine learning approaches. A review of approaches to identifying patient cohorts using EMRs revealed that out of 97 studies, 24 described rule-based systems; 41 used statistical analyses, data mining, or machine learning; and 22 described hybrid systems [<xref ref-type="bibr" rid="ref10">10</xref>]. A minimal set of rules is sufficient to accurately extract highly standardized information from the narratives [<xref ref-type="bibr" rid="ref15">15</xref>]. Their development requires iterative consultation with a clinical expert [<xref ref-type="bibr" rid="ref26">26</xref>]. Nonetheless, a well-designed rule-based system can achieve good performance on cohort selection even with a small training dataset [<xref ref-type="bibr" rid="ref36">36</xref>], which remains a problem associated with supervised machine learning approaches. When relevant concepts can be accurately identified from clinical text, both rule-based and machine learning approaches demonstrate good performance, albeit slightly in favor of machine learning [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref33">33</xref>].</p>
        <p>A variety of supervised machine learning approaches have been used to support cohort selection, including support vector machines (SVMs) [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref25">25</xref>], decision trees [<xref ref-type="bibr" rid="ref22">22</xref>], Repeated Incremental Pruning to Produce Error Reduction, random forests [<xref ref-type="bibr" rid="ref25">25</xref>], C4.5 [<xref ref-type="bibr" rid="ref33">33</xref>], logistic regression (LR) [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref28">28</xref>], naïve Bayesian (NB) learning [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref37">37</xref>], perceptron [<xref ref-type="bibr" rid="ref37">37</xref>], conditional random fields [<xref ref-type="bibr" rid="ref19">19</xref>], and deep learning [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. Unfortunately, few studies report systematic evaluation of a wide range of machine learning algorithms, thus offering little insight into the optimal performance of machine learning for cohort selection [<xref ref-type="bibr" rid="ref39">39</xref>]. Another issue associated with supervised learning is that of imbalanced data. The number of positive examples will typically vary significantly across the eligibility criteria. The data used for the 2018 National Natural Language Processing Clinical Challenge (n2c2) shared task on cohort selection for clinical trials provide a perfect illustration of this problem [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. Yet, few approaches tackled this issue with different sampling approaches. 
Instead, they may resort to using machine learning approaches generally perceived to be the most robust for imbalanced data, for example, SVMs [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>].</p>
        <p>Our review of related work illustrates the ways in which the eligibility screening process can be automated. One study reported that the time for cohort identification was reduced significantly from a few weeks to a few seconds [<xref ref-type="bibr" rid="ref16">16</xref>]. Others reported a workload reduction with automated eligibility screening of around 90% [<xref ref-type="bibr" rid="ref42">42</xref>] and a 450% increase in trial screening efficiency [<xref ref-type="bibr" rid="ref11">11</xref>]. Most recently, the patient screening time was reduced by 34%, allowing for the saved time to be redirected to activities that further streamlined teamwork among the clinical research coordinators [<xref ref-type="bibr" rid="ref43">43</xref>]. The same study showed that the numbers of subjects screened, approached, and enrolled were increased by 14.7%, 11.1%, and 11.1%, respectively. In this study, we aimed to illustrate the complexity of the eligibility screening problem and propose a way in which this task can be automated.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>System Overview</title>
        <p>In this paper, we describe the Cardiff Cohort Selection System (c2s2) [<xref ref-type="bibr" rid="ref44">44</xref>], an open-source NLP system that, given a longitudinal patient record, performs binary classification against 13 eligibility criteria for a clinical trial. For each criterion in turn, the system determines whether a patient meets or does not meet a given criterion. The eligibility criteria were predefined by the organizers of the 2018 n2c2 shared task (see <xref ref-type="table" rid="table1">Table 1</xref>) that aimed to answer the question of whether NLP systems can use narrative medical records to identify patients eligible for clinical trials.</p>
        <p>For the majority of criteria, a record needs to contain the supporting evidence for the corresponding patient to meet a given criterion, otherwise the criterion is considered <italic>not met</italic> (eg, if glycated hemoglobin [HbA<sub>1c</sub>] value is 4.7 or missing, then the criterion HBA<sub>1c</sub> is not met). The only 2 exceptions are the criteria concerning a patient’s ability to speak English and make their own medical decisions, which are assumed to be <italic>met</italic>, that is, the evidence of the contrary needs to be identified to overturn this assumption. Our system is designed to find and tag such evidence in text using a rule-based approach. A text classifier was trained on the tagged text for each criterion that had a sufficient number of both positive and negative representatives to learn from. Overall, the system consists of 5 modules whose functionality is outlined in <xref rid="figure2" ref-type="fig">Figure 2</xref>.</p>
        <p>The input to the system is a longitudinal patient record distributed as a single UTF-encoded text file, which contains multiple records generated across various health care encounters. Each individual record represents either a discharge summary or a correspondence between health care professionals [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref46">46</xref>]. Their content may cover patient demographics, progress notes, problems, prescribed medications, vital signs, past medical history, immunizations, laboratory data, and radiology reports. Individual records start with a line formatted as <italic>Record date: YYYY-MM-DD</italic> and are arranged in the ascending order by the record date. Other than that, there are no other restrictions on the format of individual records. Indeed, they may reflect a variety of different styles.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Description of the eligibility criteria, as provided in the annotation guidelines used for the National Natural Language Processing Clinical Challenge shared task.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="179"/>
            <col width="595"/>
            <col width="132"/>
            <col width="94"/>
            <thead>
              <tr valign="top">
                <td>ID</td>
                <td>Criterion</td>
                <td>Time period</td>
                <td>Default</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>ABDOMINAL</td>
                <td>Intra-abdominal surgery, small or large intestine resection, or small bowel obstruction</td>
                <td>Any</td>
                <td>Not met</td>
              </tr>
              <tr valign="top">
                <td>ADVANCED-CAD</td>
                <td>Advanced artery disease</td>
                <td>Present</td>
                <td>Not met</td>
              </tr>
              <tr valign="top">
                <td>ALCOHOL-ABUSE</td>
                <td>Alcohol use exceeds weekly recommended limits</td>
                <td>Present</td>
                <td>Not met</td>
              </tr>
              <tr valign="top">
                <td>ASP-FOR-MI</td>
                <td>Use of aspirin to prevent myocardial infarction</td>
                <td>Any</td>
                <td>Not met</td>
              </tr>
              <tr valign="top">
                <td>CREATININE</td>
                <td>Serum creatinine is above the upper limit of normal</td>
                <td>Any</td>
                <td>Not met</td>
              </tr>
              <tr valign="top">
                <td>DIETSUPP-2MOS</td>
                <td>Use of dietary supplements (excluding vitamin D)</td>
                <td>Past 2 months</td>
                <td>Not met</td>
              </tr>
              <tr valign="top">
                <td>DRUG-ABUSE</td>
                <td>Drug abuse</td>
                <td>Any</td>
                <td>Not met</td>
              </tr>
              <tr valign="top">
                <td>ENGLISH</td>
                <td>Speaks English</td>
                <td>Any</td>
                <td>Met</td>
              </tr>
              <tr valign="top">
                <td>HBA<sub>1c</sub></td>
                <td>Glycated hemoglobin value is between 6.5 and 9.5</td>
                <td>Any</td>
                <td>Not met</td>
              </tr>
              <tr valign="top">
                <td>KETO-1YR</td>
                <td>Diagnosed with ketoacidosis</td>
                <td>Past year</td>
                <td>Not met</td>
              </tr>
              <tr valign="top">
                <td>MAJOR-DIABETES</td>
                <td>Major diabetes-related complication</td>
                <td>Any</td>
                <td>Not met</td>
              </tr>
              <tr valign="top">
                <td>MAKES-DECISIONS</td>
                <td>Able to make decisions for themselves</td>
                <td>Present</td>
                <td>Met</td>
              </tr>
              <tr valign="top">
                <td>MI-6MOS</td>
                <td>Myocardial infarction</td>
                <td>Past 6 months</td>
                <td>Not met</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>System architecture.</p>
          </caption>
          <graphic xlink:href="medinform_v7i4e15980_fig2.png" alt-version="no" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Preprocessing</title>
        <p>In addition to standard preprocessing operations (see <xref rid="figure2" ref-type="fig">Figure 2</xref>), special consideration is given to punctuation. Its use in clinical narratives proved to affect the results of text segmentation algorithms developed for general language [<xref ref-type="bibr" rid="ref47">47</xref>]. On one hand, clinical narratives commonly use punctuation as a means of abbreviation (see <xref ref-type="table" rid="table2">Table 2</xref> for examples). Such use of punctuation may easily be misinterpreted as a sentence terminator. For instance, phrases such as “q. Sunday,” “vit. D,” and “Dr. Harold Nutter” feature a period followed by an uppercase letter, a pattern that is commonly exploited in both rule-based and machine learning approaches to split sentences. Segmentation errors can propagate onto the subsequent stages of text processing, resulting in the loss of syntactic dependencies between related words and consequently their contribution to the overall semantics. For example, incorrectly splitting a sentence within the phrase “vit. D” would effectively erase this mention of “vitamin D,” a named entity of direct relevance to the eligibility criterion DIETSUPP-2MOS (see <xref ref-type="table" rid="table1">Table 1</xref>). To prevent parsing errors of this type, pattern-matching rules were developed to identify and remove punctuation used in such contexts before performing sentence segmentation (see <xref ref-type="table" rid="table2">Table 2</xref> for examples).</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>A selection of rule-based punctuation removal examples.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="400"/>
            <col width="400"/>
            <thead>
              <tr valign="top">
                <td>Rule target</td>
                <td>Input</td>
                <td>Output</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Prescription</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>q. a.m.</p>
                    </list-item>
                    <list-item>
                      <p>q. Sunday</p>
                    </list-item>
                    <list-item>
                      <p>tab.</p>
                    </list-item>
                  </list>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>qam</p>
                    </list-item>
                    <list-item>
                      <p>q Sunday</p>
                    </list-item>
                    <list-item>
                      <p>tab</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Vitamin</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>vit. D</p>
                    </list-item>
                    <list-item>
                      <p>MVit.</p>
                    </list-item>
                  </list>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>vit D</p>
                    </list-item>
                    <list-item>
                      <p>MVit</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Personal title</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Dr. Harold Nutter</p>
                    </list-item>
                    <list-item>
                      <p>Harold Nutter, Ph.D.</p>
                    </list-item>
                  </list>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Dr Harold Nutter</p>
                    </list-item>
                    <list-item>
                      <p>Harold Nutter, PhD</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Shorthand x</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>hx. of migraines</p>
                    </list-item>
                    <list-item>
                      <p>sx. of depression</p>
                    </list-item>
                    <list-item>
                      <p>Rx. for cpap</p>
                    </list-item>
                  </list>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>hx of migraines</p>
                    </list-item>
                    <list-item>
                      <p>sx of depression</p>
                    </list-item>
                    <list-item>
                      <p>Rx for cpap</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Species name</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>E. coli</p>
                    </list-item>
                    <list-item>
                      <p>C. diff</p>
                    </list-item>
                    <list-item>
                      <p>H. pylori</p>
                    </list-item>
                  </list>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>E coli</p>
                    </list-item>
                    <list-item>
                      <p>C diff</p>
                    </list-item>
                    <list-item>
                      <p>H pylori</p>
                    </list-item>
                  </list>
                </td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>Clinical narratives also feature prevalent use of short formulaic statements such as field:value combinations (eg, <italic>Substance abuse: none</italic>) and itemized lists (see <xref ref-type="boxed-text" rid="box1">Textbox 1</xref> for an example).</p>
        <p>Such statements are not commonly terminated by means of punctuation. When used consecutively, this can often result in independent statements being incorrectly grouped together in a single sentence. Their intersentential co-occurrence may later be easily confused with relatedness. Consider, for instance, amalgamating the above itemized list into a continuous sequence “<italic>s/p cerebral infarction myocardial scan normal blood pressure today 190/108</italic>.” It could lead to incorrectly recognizing <italic>infarction</italic> as a <italic>myocardial</italic> one and the <italic>blood pressure</italic> as <italic>normal</italic>, when in fact, the <italic>infarction</italic> is <italic>cerebral</italic>, and the <italic>blood pressure</italic> is <italic>abnormally high</italic>. Acting preemptively, we perform document layout analysis to identify itemized lists and insert punctuation where appropriate before performing sentence segmentation. Consequently, this will force independent fragments to be interpreted as separate sentences.</p>
        <p>Finally, to streamline subsequent text analysis, we use pattern-matching rules to fully expand enclitics and special characters. For example, <italic>couldn't</italic> is expanded to <italic>could not</italic>, whereas <italic>con't</italic> is expanded to <italic>continue</italic>. This will later simplify identification of negated expressions. Similarly, to prune the number of IE rules, we lexicalized a relevant set of special characters. For example, <italic>BUN/Cr ratio is</italic> &#62;<italic>20</italic> would become <italic>BUN/Cr ratio is greater than 20</italic>.</p>
        <boxed-text id="box1" position="float">
          <title>An example of assessment recorded as an itemized list.</title>
          <list list-type="bullet">
            <list-item>
              <p>
                <italic>s/p cerebral infarction</italic>
              </p>
            </list-item>
            <list-item>
              <p>
                <italic>myocardial scan normal</italic>
              </p>
            </list-item>
            <list-item>
              <p>
                <italic>blood pressure today 190/108</italic>
              </p>
            </list-item>
          </list>
        </boxed-text>
      </sec>
      <sec>
        <title>Normalization</title>
        <p>Text normalization is performed with a similar intent: to simplify subsequent text analysis. It involves mapping of a selected subset of words and phrases onto their representatives, which can be either a preferred synonym or a hypernym (see <xref ref-type="table" rid="table3">Table 3</xref> for examples). Special consideration is given to acronyms and abbreviations as they are known to have a major impact on retrieval of relevant information. First, disambiguation is performed for a small subset of abbreviations of direct relevance for the given classification tasks. Examples include <italic>ca</italic> (<italic>calcium</italic> vs <italic>cancer</italic>), <italic>mg</italic> (<italic>magnesium</italic> vs <italic>milligram</italic>), and <italic>CR</italic> (<italic>creatinine</italic> vs <italic>controlled release</italic>). A context-sensitive approach is used to select an appropriate interpretation. For example, if CR is used in combination with words such as <italic>tablet</italic> or <italic>capsule</italic>, then it is assumed to refer to <italic>controlled release</italic>.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Examples of text normalization.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="126"/>
            <col width="341"/>
            <col width="312"/>
            <col width="221"/>
            <thead>
              <tr valign="top">
                <td>Example</td>
                <td>Surface forms</td>
                <td>Normalized form</td>
                <td>Relevance</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>mom, father, sister</td>
                <td>family member</td>
                <td>filtering</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>FH, FHx, FamHx</td>
                <td>family history</td>
                <td>filtering</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>whiskey, vodka, beer</td>
                <td>alcohol</td>
                <td>ALCOHOL-ABUSE</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>Lantus, Humalog, NPH</td>
                <td>insulin</td>
                <td>MAJOR-DIABETES</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>DM2, DMII, NIDDM</td>
                <td>diabetes mellitus 2</td>
                <td>MAJOR-DIABETES</td>
              </tr>
              <tr valign="top">
                <td>6</td>
                <td>CRRT, CRRTX</td>
                <td>continuous renal replacement therapy</td>
                <td>MAJOR-DIABETES</td>
              </tr>
              <tr valign="top">
                <td>7</td>
                <td>ARF</td>
                <td>acute renal failure</td>
                <td>MAJOR-DIABETES</td>
              </tr>
              <tr valign="top">
                <td>8</td>
                <td>CKD</td>
                <td>chronic kidney disease</td>
                <td>MAJOR-DIABETES</td>
              </tr>
              <tr valign="top">
                <td>9</td>
                <td>BB, bblocker, betablocker</td>
                <td>beta blocker</td>
                <td>ADVANCED-CAD</td>
              </tr>
              <tr valign="top">
                <td>10</td>
                <td>ECG, EKG</td>
                <td>electrocardiogram</td>
                <td>ADVANCED-CAD</td>
              </tr>
              <tr valign="top">
                <td>11</td>
                <td>ICD</td>
                <td>implantable cardioverter defibrillator</td>
                <td>ADVANCED-CAD</td>
              </tr>
              <tr valign="top">
                <td>12</td>
                <td>CVD</td>
                <td>cardiovascular disease</td>
                <td>ADVANCED-CAD</td>
              </tr>
              <tr valign="top">
                <td>13</td>
                <td>MI, heart attack</td>
                <td>myocardial infarction</td>
                <td>MI-6MOS, ASP-FOR-MI, ADVANCED-CAD</td>
              </tr>
              <tr valign="top">
                <td>14</td>
                <td>STEMI</td>
                <td>ST elevation myocardial infarction</td>
                <td>MI-6MOS, ASP-FOR-MI, ADVANCED-CAD</td>
              </tr>
              <tr valign="top">
                <td>15</td>
                <td>ASA, ECASA</td>
                <td>aspirin</td>
                <td>ASP-FOR-MI</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>Other acronyms and abbreviations of interest are then expanded using a bespoke lexicon (&#62;500 entries) developed specifically for this task. To bootstrap the lexicon construction, the raw training data were used to analyze frequently occurring words. Orthographic features (uppercase typeset, eg, <italic>STEMI</italic>, or the use of punctuation, eg, <italic>q.a.m.</italic> or <italic>r/o</italic>) and spelling checker (eg, <italic>inpt</italic>) were used to identify potential acronyms and abbreviations as <italic>unknown</italic> words that are also relatively short. Medical expertise was used to identify the corresponding full forms. Simple Concordance Program [<xref ref-type="bibr" rid="ref48">48</xref>] was used to verify manually whether the proposed full forms apply across the majority of contexts within the training data to enable the use of a context-free approach for acronym and abbreviation expansion.</p>
        <p>The only acronym exempt from expansion was <italic>CCB</italic>. In fact, all occurrences of <italic>calcium channel blocker</italic> were replaced by the corresponding acronym. The reason behind this decision is the fact that both <italic>calcium</italic> as a supplement and <italic>calcium channel blocker</italic> often occur in similar context (eg, medication list). As one of the eligibility criteria was concerned with dietary supplementation (see DIETSUPP-2MOS in <xref ref-type="table" rid="table1">Table 1</xref>), this reduced the risk of interpreting the latter mention of <italic>calcium</italic> as a supplement.</p>
        <p>To illustrate the extent to which text normalization can simplify its subsequent analysis, we can use examples provided in <xref ref-type="table" rid="table3">Table 3</xref>. For example, by replacing the surface forms in Example 1 by their hypernym and expanding abbreviations in Example 2, we can simply use the occurrence of the word <italic>family</italic> to filter out sentences or the whole sections that refer to family members. Consider, for example, the original text given in <xref ref-type="boxed-text" rid="box2">Textbox 2</xref> and its normalized counterpart in <xref ref-type="boxed-text" rid="box3">Textbox 3</xref>.</p>
        <boxed-text id="box2" position="float">
          <title>An original example of family history.</title>
          <p>
            <italic>FH: Mom w/ PM at age 50, died of MI at 71. Father w/ EtOH, HTN. Sister w/ 4 miscarriages.</italic>
          </p>
        </boxed-text>
        <boxed-text id="box3" position="float">
          <title>A normalized example of family history.</title>
          <p>
            <italic>Family history: Family member with pacemaker at age 50, died of myocardial infarction at 71. Family member with alcohol abuse, hypertension. Family member with 4 miscarriages.</italic>
          </p>
        </boxed-text>
        <p>By filtering out references to family members, we are effectively removing the mentions of <italic>myocardial infarction</italic> and <italic>alcohol abuse</italic> that do not apply to the given patient. Consequently, we can use the remaining references to <italic>myocardial infarction</italic> and <italic>alcohol abuse</italic>, if any, as evidence for eligibility criteria MI-6MOS and ALCOHOL-ABUSE (see <xref ref-type="table" rid="table1">Table 1</xref>). Similarly, by mapping alcoholic beverages in Example 3 to their hypernym, the subsequent analysis related to the eligibility criterion ALCOHOL-ABUSE (see <xref ref-type="table" rid="table1">Table 1</xref>) can simply focus on any mention of the word <italic>alcohol</italic>. Examples 4 and 5 show that 2 keywords, <italic>insulin</italic> and <italic>diabetes</italic>, can be used to look for evidence of diabetes. Once unpacked from the corresponding acronyms (Example 5), the word <italic>diabetes</italic> becomes accessible to text analysis. Similarly, words <italic>renal</italic> and <italic>kidney</italic> become visible after expanding acronyms in Examples 6-8. Knowing that diabetes is a major risk factor for kidney disease, we can subsequently use close occurrences of the word <italic>diabetes</italic> to either of the words <italic>renal</italic> or <italic>kidney</italic> as evidence for the eligibility criterion MAJOR-DIABETES (see <xref ref-type="table" rid="table1">Table 1</xref>). Similar to lexical analysis, morphological analysis can be used to identify features relevant to the given eligibility criteria. Normalized forms in Examples 10-14 related to ADVANCED-CAD (see <xref ref-type="table" rid="table1">Table 1</xref>) incorporate a morpheme <italic>cardi(o)</italic>, which signifies that these medical concepts are related to the heart, which can be affected by coronary artery disease.</p>
      </sec>
      <sec>
        <title>Filtering</title>
        <p>Once the text has been regularized by means of preprocessing and normalization, information not directly relevant to the given classification tasks is filtered out. We focus on 4 types of such information:</p>
        <list list-type="order">
          <list-item>
            <p>negation, for example,<italic> ruled out for MI by enzymes</italic></p>
          </list-item>
          <list-item>
            <p>family history, for example,<italic> mother died at age 62 of a heart attack</italic></p>
          </list-item>
          <list-item>
            <p>allergies, for example,<italic> Allergies: aspirin—GI upset</italic></p>
          </list-item>
          <list-item>
            <p>time window, for example,<italic> records older than the last 6 months</italic></p>
          </list-item>
        </list>
        <p>Removal of such information simplifies subsequent classification by allowing the use of a BoW approach. For example, by not considering the first 2 examples, the risk of misclassifying a patient as having a <italic>myocardial infarction</italic> is reduced. Similarly, by removing the third example from consideration, the risk of misclassifying a patient as one taking <italic>aspirin</italic> to prevent <italic>myocardial infarction</italic> is also reduced. Finally, as some of the eligibility criteria were time dependent (namely, ALCOHOL-ABUSE, DIETSUPP-2MOS, KETO-1YR, MAKES-DECISIONS, and MI-6MOS—see <xref ref-type="table" rid="table1">Table 1</xref> for definitions), we identified dates of individual medical records to extract the ones relevant to the given time windows and stored them separately for use by the corresponding classifiers.</p>
        <p>We used a set of regular expressions, which are available from the c2s2 GitHub repository [<xref ref-type="bibr" rid="ref44">44</xref>], to identify the 4 types of information considered. Regular expressions used to identify negation are based on the NegEx algorithm for identifying negated concepts in clinical notes [<xref ref-type="bibr" rid="ref49">49</xref>].</p>
      </sec>
      <sec>
        <title>Feature Extraction</title>
        <p>Thus far, we reduced the noise and lexical variability in the data by means of filtering and normalization. This is expected to improve the performance of a supervised classifier. Another action that stands to improve the classification performance when trained on a relatively small dataset is that of reducing dimensionality of a BoW representation by aggregating related features into a single representative. In its simplest form, feature aggregation can be achieved by abstracting words into semantic classes. Where domain ontology is available, such abstraction can be automated by exploiting its taxonomic structure. The Semantic Network of the UMLS can be used to automatically abstract words into semantic types. However, as examples given in <xref ref-type="table" rid="table4">Table 4</xref> illustrate, the UMLS semantic types are too broad in the context of eligibility criteria described in <xref ref-type="table" rid="table1">Table 1</xref>. For example, abstracting Examples 1-4 into pharmacologic substance would dilute rather than distil relevant information. A finer-grained abstraction tuned for the given eligibility criteria would be more appropriate (see the last 2 columns in <xref ref-type="table" rid="table4">Table 4</xref>), but it would also incur some knowledge engineering overhead. However, the widespread availability of Web resources that summarize information pertaining to health and well-being can greatly reduce such overhead. We defined a total of 8 abstraction categories and assembled the corresponding lexica using online resources (see <xref ref-type="table" rid="table5">Table 5</xref>).</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Examples of word abstraction.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="100"/>
            <col width="300"/>
            <col width="240"/>
            <col width="180"/>
            <col width="180"/>
            <thead>
              <tr valign="top">
                <td>Example</td>
                <td>Surface forms</td>
                <td>Semantic type</td>
                <td>Abstraction</td>
                <td>Relevance</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>marijuana, heroin, ecstasy</td>
                <td>Pharmacologic substance</td>
                <td>Illicit drug</td>
                <td>DRUG-ABUSE</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>beta blocker, nitroglycerin, CCB</td>
                <td>Pharmacologic substance</td>
                <td>Heart medication</td>
                <td>ADVANCED-CAD</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>crestor, advicor, compactin</td>
                <td>Pharmacologic substance</td>
                <td>Statin</td>
                <td>ADVANCED-CAD</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>vitamin C, calcium, primrose oil</td>
                <td>Pharmacologic substance</td>
                <td>Supplement</td>
                <td>DIETSUPP-2MOS</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>turmeric, green tea, cinnamon</td>
                <td>Food</td>
                <td>Supplement</td>
                <td>DIETSUPP-2MOS</td>
              </tr>
              <tr valign="top">
                <td>6</td>
                <td>vodka, beer, wine</td>
                <td>Food</td>
                <td>Alcohol</td>
                <td>ALCOHOL-ABUSE</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Rule-based feature extraction.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="150"/>
            <col width="250"/>
            <col width="260"/>
            <col width="340"/>
            <thead>
              <tr valign="top">
                <td>Tag</td>
                <td>Feature</td>
                <td>Extraction<sup>a</sup></td>
                <td>Examples<sup>b</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>MEDRX</td>
                <td>Prescription instructions</td>
                <td>Regular expressions</td>
                <td>po q4h prn</td>
              </tr>
              <tr valign="top">
                <td>KIDMED</td>
                <td>Kidney medication</td>
                <td>Lexicon (221 entries)<sup>c</sup></td>
                <td>Thymoglobulin</td>
              </tr>
              <tr valign="top">
                <td>BRPMED</td>
                <td>Blood pressure medication</td>
                <td>—<sup>d</sup></td>
                <td>Avapro</td>
              </tr>
              <tr valign="top">
                <td>HRTMED</td>
                <td>Heart medication</td>
                <td>—</td>
                <td>Plavix</td>
              </tr>
              <tr valign="top">
                <td>HRTTRT</td>
                <td>Heart treatment</td>
                <td>Regular expressions</td>
                <td>Re<italic>catheteriz</italic>ation</td>
              </tr>
              <tr valign="top">
                <td>HRTISC</td>
                <td>Heart ischemia</td>
                <td>Regular expressions</td>
                <td>Electro<italic>cardio</italic>gram demonstrated <italic>ischemic</italic> changes</td>
              </tr>
              <tr valign="top">
                <td>HRTANG</td>
                <td>Angina</td>
                <td>Regular expressions</td>
                <td>Chest wall heaviness</td>
              </tr>
              <tr valign="top">
                <td>HRTCAD</td>
                <td>Any of the HRT tags above + explicit references to CAD</td>
                <td>Regular expressions</td>
                <td>Given his extensive cardiac history</td>
              </tr>
              <tr valign="top">
                <td>ASPFMI</td>
                <td>Aspirin for heart problems</td>
                <td>Regular expressions</td>
                <td>Start on heparin <italic>HRT</italic>MED and <italic>aspirin</italic> and take to <italic>HRT</italic>TRT catheterization laboratory</td>
              </tr>
              <tr valign="top">
                <td>SPLMNT</td>
                <td>Supplement (strong evidence)</td>
                <td>Lexicon (67 entries) + regular expressions</td>
                <td>Ibuprofen 800 mg <italic>MED</italic>RX <italic>potassium</italic> chloride 10 meq <italic>MED</italic>RX lasix 20 mg <italic>MED</italic>RX</td>
              </tr>
              <tr valign="top">
                <td>DFCNCY</td>
                <td>Supplement (weak evidence)</td>
                <td>Lexicon (27 entries) + regular expressions</td>
                <td><italic>Iron deficien</italic>cy anemia</td>
              </tr>
              <tr valign="top">
                <td>MNTCAP</td>
                <td>Mental capacity</td>
                <td>Regular expressions</td>
                <td>Increasing <italic>disorientat</italic>ion and visual <italic>hallucinat</italic>ions</td>
              </tr>
              <tr valign="top">
                <td>DRGADD</td>
                <td>Substance abuse</td>
                <td>Lexicon (17 entries) + regular expressions</td>
                <td>History of <italic>cocaine</italic> abuse</td>
              </tr>
              <tr valign="top">
                <td>NOENGL</td>
                <td>Does not speak English</td>
                <td>Lexicon (66 entries) + regular expressions</td>
                <td>An <italic>Indonesian speaking</italic> 85-year-old male</td>
              </tr>
              <tr valign="top">
                <td>ALCABS</td>
                <td>Alcohol abuse</td>
                <td>Lexicon (7 entries) + regular expressions</td>
                <td><italic>Alcoholism</italic> 10 years ago</td>
              </tr>
              <tr valign="top">
                <td>ALCSTP</td>
                <td>Stopped drinking alcohol</td>
                <td>Regular expressions</td>
                <td>Alcoholism 10 <italic>years ago</italic></td>
              </tr>
              <tr valign="top">
                <td>KETACD</td>
                <td>Ketoacidosis</td>
                <td>Regular expressions</td>
                <td>Ketones positive</td>
              </tr>
              <tr valign="top">
                <td>KIDDAM</td>
                <td>Kidney problems</td>
                <td>Regular expressions</td>
                <td><italic>Worse</italic>ning <italic>renal</italic> dys<italic>function</italic></td>
              </tr>
              <tr valign="top">
                <td>DMCMPL</td>
                <td>Diabetic complications</td>
                <td>Regular expressions</td>
                <td><italic>Diabet</italic>es mellitus related retino<italic>pathy</italic>/neuro<italic>pathy</italic></td>
              </tr>
              <tr valign="top">
                <td>ABDMNL</td>
                <td>Abdominal surgery or small bowel obstruction</td>
                <td>Regular expressions</td>
                <td>Gastric <italic>laparo</italic>scopic bypass surgery</td>
              </tr>
              <tr valign="top">
                <td>HIGHCRT</td>
                <td>High creatinine</td>
                <td>Regular expressions + information extraction</td>
                <td>Blood urea nitrogen/<italic>creatinine</italic> of 21/<italic>1.7</italic></td>
              </tr>
              <tr valign="top">
                <td>GLYHMG</td>
                <td>Glycated hemoglobin in a given interval</td>
                <td>Information extraction</td>
                <td><italic>HbA</italic><sub>1c</sub> one month ago was <italic>6.7</italic></td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>All lexicons and regular expressions are available from the c2s2 GitHub repository [<xref ref-type="bibr" rid="ref44">44</xref>].</p>
            </fn>
            <fn id="table5fn2">
              <p><sup>b</sup>Italic typeset is used to indicate the types of text features targeted by lexicons and regular expressions.</p>
            </fn>
            <fn id="table5fn3">
              <p><sup>c</sup>KIDMED, BRPMED, HRTMED are organized into a single lexicon of 221 entries.</p>
            </fn>
            <fn id="table5fn4">
              <p><sup>d</sup>Not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>Once the BoW representation is passed onto a supervised classifier, the context of individual words will be lost. For instance, blood tests frequently feature essential minerals such as calcium, potassium, and iron, which can also be prescribed under the same names as supplements. The BoW approach will take these names out of context, keeping their frequency as the only information about them. Conversely, simple pattern analysis can be used to differentiate between the 2 types of context. For example, we can model prescription instructions using regular expressions (see <xref ref-type="table" rid="table5">Table 5</xref>) and tag this information in text in the form of a token (eg, MEDRX) that is lexically distinguishable from other tokens. We can subsequently apply another regular expression to find mentions of essential minerals in close proximity to the MEDRX token and tag such mentions using another special-purpose tag (eg, SPLMNT). When we now apply the BoW approach, the token SPLMNT, treated as any other text token, will represent a feature that preserves relevant contextual information. Supervised machine learning algorithms can then take advantage of such a feature in combination with the standard BoW features. Regular expressions are used to embed a total of 18 context-sensitive features into text (see <xref ref-type="table" rid="table5">Table 5</xref>).</p>
        <p>Regular expressions can be used to model categorical references to information relevant to the given eligibility criteria. For example, regular expressions can be used to link the word <italic>creatinine</italic> with a stem <italic>elev</italic>- in the phrase <italic>a mildly elevated creatinine</italic> and use it as an indication for meeting the eligibility criterion CREATININE (see <xref ref-type="table" rid="table1">Table 1</xref>). However, knowing whether serum creatinine is above the upper limit of normal in a phrase such as “blood urea nitrogen and creatinine ratio of 40 and 1.0 respectively” requires not only extracting the correct numerical value (1.0) but also comparing it with the reference value (1.5). Two eligibility criteria, CREATININE and HBA<sub>1c</sub>, require extraction of numerical information and its subsequent analysis, as indicated in <xref ref-type="table" rid="table5">Table 5</xref>. As before, the outcome of such context-sensitive analysis is embedded back into the text for further exploitation by supervised machine learning.</p>
        <p>Overall, a total of 22 tags described in <xref ref-type="table" rid="table5">Table 5</xref> were chosen so that they can be lexically and orthographically distinguishable from other words upon their imputation into the processed text. The corresponding features are extracted incrementally in the order given in <xref ref-type="table" rid="table5">Table 5</xref> and, when appropriate, used to support extraction of other features. For example, knowing that heparin is a heart medication (indicated by the tag HRTMED—see <xref ref-type="table" rid="table5">Table 5</xref>) can be used to infer that, when aspirin is taken together with heparin, it is likely to be used as prophylaxis for the prevention of cardiovascular events such as myocardial infarction (indicated by the tag ASPFMI—see <xref ref-type="table" rid="table5">Table 5</xref>).</p>
      </sec>
      <sec>
        <title>Classification</title>
        <p>This module consists of 13 binary classifiers, 1 for each eligibility criterion (see <xref ref-type="table" rid="table1">Table 1</xref>). The distribution of class labels in the training data informed the choice of a classification method. Supervised machine learning was chosen wherever a sufficient number of both positive and negative instances were available to learn from (see <xref rid="figure3" ref-type="fig">Figure 3</xref>). A rule-based approach focusing on a small set of relevant features was chosen for the remaining criteria (see <xref ref-type="table" rid="table6">Table 6</xref>). The corresponding classification rules were based on a relevant set of manually engineered features described earlier in <xref ref-type="table" rid="table5">Table 5</xref>. Each rule was defined as a function of these features and a threshold value that maximizes the class separation, both chosen manually. The only exception was associated with the criterion MI-6MOS, where the final rule was induced from the training data in the form of a decision tree using a manually selected set of features.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Distribution of class labels.</p>
          </caption>
          <graphic xlink:href="medinform_v7i4e15980_fig3.png" alt-version="no" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>Features used in rule-based classification.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>ID</td>
                <td>Features</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>ALCOHOL-ABUSE</td>
                <td>ALCABS, ALCSTP</td>
              </tr>
              <tr valign="top">
                <td>DRUG-ABUSE</td>
                <td>DRGADD</td>
              </tr>
              <tr valign="top">
                <td>ENGLISH</td>
                <td>NOENGL</td>
              </tr>
              <tr valign="top">
                <td>KETO-1YR</td>
                <td>KETACD</td>
              </tr>
              <tr valign="top">
                <td>MAKES-DECISIONS</td>
                <td>MNTCAP</td>
              </tr>
              <tr valign="top">
                <td>MI-6MOS</td>
                <td>BRPMED, HRTMED, HRTTRT, HRTISC, HRTANG, HRTCAD, ASPFMI</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>Note that the numerical values used in criteria CREATININE and HBA<sub>1c</sub> were also extracted using a rule-based approach. However, in a longitudinal report, different values may be reported at different time points. In the absence of clear guidelines, we used machine learning on top of IE to determine automatically from the training data how to deal with such cases.</p>
        <p>A machine learning approach was used for all other criteria. According to the <italic>no free lunch</italic> theorem [<xref ref-type="bibr" rid="ref39">39</xref>], there is no universally best learning algorithm. In other words, the performance of machine learning algorithms depends not only on a specific computational task at hand but also on the properties of the data that characterize the problem. To compare the performance of different algorithms, we used 10-fold cross-validation experiments. We chose a representative algorithm from 4 major categories: function-based learning, regression analysis, probabilistic learning, and ensemble learning. Specific algorithms chosen were SVM with radial basis function kernel, LR, NB classifier, and gradient tree boosting (GTB), respectively. In our experiments, we used implementations of the first 3 algorithms in scikit-learn, an open-source Python library for data analysis and modeling [<xref ref-type="bibr" rid="ref50">50</xref>]. Experiments with GTB were performed using XGBoost, an open-source software library that implements a gradient boosting framework for Python [<xref ref-type="bibr" rid="ref51">51</xref>]. All experiments were performed with the default parameter values.</p>
        <p>We trained all classifiers using single words and/or bigrams as features with and without feature selection based on L1 regularized linear SVM. The overall performance was statistically indistinguishable across different types of features used. Therefore, we opted for a simple BoW approach with feature selection for efficiency reasons. To evaluate the impact of the class imbalance on the classification performance, we balanced the training data using random undersampling and oversampling with default parameters from scikit-learn [<xref ref-type="bibr" rid="ref50">50</xref>].</p>
        <p><xref rid="figure4" ref-type="fig">Figure 4</xref> summarizes the performance in terms of microaveraged <italic>F</italic> measure. Overall, GTB demonstrated the most consistent performance. Its performance peaked when oversampling was used to balance the training data. GTB is an ensemble classifier over a set of simple decision trees, which are varied according to specific parameter settings (learning rate and maximum tree depth). Having chosen GTB as the learning method, we optimized its parameters by performing grid search on learning rate (0.001-0.5) and maximum tree depth (2-10) using the oversampled training data. The learning rate of 0.02 and maximum depth of 10 were chosen for the holdout evaluation described in the next section.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Summary of cross-validation results. SVM: support vector machines; LR: logistic regression; NB: naïve Bayesian; GTB: gradient tree boosting; HBA<sub>1c</sub>: glycated hemoglobin.</p>
          </caption>
          <graphic xlink:href="medinform_v7i4e15980_fig4.png" alt-version="no" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>The results of classification experiments on previously unseen test data are summarized in <xref ref-type="table" rid="table7">Table 7</xref>. The evaluation results were calculated using a script released by the organizers of the 2018 n2c2 shared task. We used the best results from 3 related studies as the baseline. They used rule-based [<xref ref-type="bibr" rid="ref36">36</xref>], hybrid [<xref ref-type="bibr" rid="ref18">18</xref>], and hierarchical neural network (HNN) [<xref ref-type="bibr" rid="ref38">38</xref>] approaches. We interpret the results for each classifier separately.</p>
      <table-wrap position="float" id="table7">
        <label>Table 7</label>
        <caption>
          <p>Detailed holdout test results.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="230"/>
          <col width="80"/>
          <col width="80"/>
          <col width="70"/>
          <col width="0"/>
          <col width="70"/>
          <col width="70"/>
          <col width="80"/>
          <col width="0"/>
          <col width="90"/>
          <col width="0"/>
          <col width="70"/>
          <col width="90"/>
          <col width="70"/>
          <thead>
            <tr valign="top">
              <td rowspan="2">ID</td>
              <td colspan="4">Met<sup>a</sup></td>
              <td colspan="4">Not met<sup>a</sup></td>
              <td colspan="2">Overall</td>
              <td colspan="2">Baseline<sup>b</sup></td>
              <td>c2s2<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td>P<sup>d</sup> (%)</td>
              <td>R<sup>e</sup> (%)</td>
              <td>F<sup>f</sup> (%)</td>
              <td colspan="2">P (%)</td>
              <td>R (%)</td>
              <td>F (%)</td>
              <td colspan="2">F (%)</td>
              <td colspan="2">F (%)</td>
              <td>System</td>
              <td>Rank</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>ABDOMINAL</td>
              <td>64.86</td>
              <td>80.00</td>
              <td>71.64</td>
              <td colspan="2">87.76</td>
              <td>76.79</td>
              <td>81.90</td>
              <td colspan="2">76.77</td>
              <td colspan="2">90.64</td>
              <td>Rules</td>
              <td>4</td>
            </tr>
            <tr valign="top">
              <td>ADVANCED-CAD</td>
              <td>83.02</td>
              <td>97.78</td>
              <td>89.80</td>
              <td colspan="2">96.97</td>
              <td>78.05</td>
              <td>86.49</td>
              <td colspan="2">88.14</td>
              <td colspan="2">88.14</td>
              <td>c2s2</td>
              <td>1</td>
            </tr>
            <tr valign="top">
              <td>ALCOHOL-ABUSE</td>
              <td>22.22</td>
              <td>66.67</td>
              <td>33.33</td>
              <td colspan="2">98.70</td>
              <td>91.57</td>
              <td>95.00</td>
              <td colspan="2">64.17</td>
              <td colspan="2">
                <italic>89.70</italic>
              </td>
              <td>Hybrid</td>
              <td>2</td>
            </tr>
            <tr valign="top">
              <td>ASP-FOR-MI</td>
              <td>87.67</td>
              <td>94.12</td>
              <td>90.78</td>
              <td colspan="2">69.23</td>
              <td>50.00</td>
              <td>58.06</td>
              <td colspan="2">74.42</td>
              <td colspan="2">77.34</td>
              <td>HNN<sup>g</sup></td>
              <td>2</td>
            </tr>
            <tr valign="top">
              <td>CREATININE</td>
              <td>80.00</td>
              <td>83.33</td>
              <td>81.63</td>
              <td colspan="2">93.44</td>
              <td>91.94</td>
              <td>92.68</td>
              <td colspan="2">87.16</td>
              <td colspan="2">89.75</td>
              <td>Rules</td>
              <td>2</td>
            </tr>
            <tr valign="top">
              <td>DIETSUPP-2MOS</td>
              <td>78.85</td>
              <td>93.18</td>
              <td>85.42</td>
              <td colspan="2">91.18</td>
              <td>73.81</td>
              <td>81.58</td>
              <td colspan="2">83.50</td>
              <td colspan="2">89.53</td>
              <td>Hybrid</td>
              <td>4</td>
            </tr>
            <tr valign="top">
              <td>DRUG-ABUSE</td>
              <td>40.00</td>
              <td>66.67</td>
              <td>50.00</td>
              <td colspan="2">98.77</td>
              <td>96.39</td>
              <td>97.56</td>
              <td colspan="2">73.78</td>
              <td colspan="2">
                <italic>92.55</italic>
              </td>
              <td>Hybrid</td>
              <td>2</td>
            </tr>
            <tr valign="top">
              <td>ENGLISH</td>
              <td>91.25</td>
              <td>100.00</td>
              <td>95.42</td>
              <td colspan="2">100.00</td>
              <td>46.15</td>
              <td>63.16</td>
              <td colspan="2">79.29</td>
              <td colspan="2">97.66</td>
              <td>Hybrid</td>
              <td>4</td>
            </tr>
            <tr valign="top">
              <td>HBA<sub>1c</sub></td>
              <td>100.00</td>
              <td>82.86</td>
              <td>90.62</td>
              <td colspan="2">89.47</td>
              <td>100.00</td>
              <td>94.44</td>
              <td colspan="2">92.53</td>
              <td colspan="2">93.82</td>
              <td>Rules</td>
              <td>2</td>
            </tr>
            <tr valign="top">
              <td>KETO-1YR</td>
              <td>0.00</td>
              <td>0.00</td>
              <td>0.00</td>
              <td colspan="2">100.00</td>
              <td>100.00</td>
              <td>100.00</td>
              <td colspan="2">50.00</td>
              <td colspan="2">
                <italic>50.00</italic>
              </td>
              <td>All</td>
              <td>1</td>
            </tr>
            <tr valign="top">
              <td>MAJOR-DIABETES</td>
              <td>85.00</td>
              <td>79.07</td>
              <td>81.93</td>
              <td colspan="2">80.43</td>
              <td>86.05</td>
              <td>83.15</td>
              <td colspan="2">82.54</td>
              <td colspan="2">86.02</td>
              <td>Hybrid</td>
              <td>2</td>
            </tr>
            <tr valign="top">
              <td>MAKES-DECISIONS</td>
              <td>97.62</td>
              <td>98.80</td>
              <td>98.20</td>
              <td colspan="2">50.00</td>
              <td>33.33</td>
              <td>40.00</td>
              <td colspan="2">69.10</td>
              <td colspan="2">
                <italic>74.40</italic>
              </td>
              <td>HNN</td>
              <td>2</td>
            </tr>
            <tr valign="top">
              <td>MI-6MOS</td>
              <td>33.33</td>
              <td>50.00</td>
              <td>40.00</td>
              <td colspan="2">94.59</td>
              <td>89.74</td>
              <td>92.11</td>
              <td colspan="2">66.05</td>
              <td colspan="2">
                <italic>87.59</italic>
              </td>
              <td>Rules</td>
              <td>4</td>
            </tr>
            <tr valign="top">
              <td>Overall<sup>h</sup> (microaveraged)</td>
              <td>83.97</td>
              <td>91.29</td>
              <td>87.47</td>
              <td colspan="2">93.54</td>
              <td>87.86</td>
              <td>90.61</td>
              <td colspan="2">89.04</td>
              <td colspan="2">91.11</td>
              <td>Hybrid</td>
              <td>4</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table7fn1">
            <p><sup>a</sup>The binary classification task involves 2 classes (<italic>met</italic> and <italic>not met</italic>). The results are provided for each class separately and then combined into the overall F value.</p>
          </fn>
          <fn id="table7fn2">
            <p><sup>b</sup>The best results from 3 related studies are used as the baseline. They are named after the approach they used: rules [<xref ref-type="bibr" rid="ref36">36</xref>], hybrid [<xref ref-type="bibr" rid="ref18">18</xref>], and HNN [<xref ref-type="bibr" rid="ref38">38</xref>]. The baseline results in italics were calculated on the basis of at most eight positive examples, which account for less than 10% of the test data.</p>
          </fn>
          <fn id="table7fn3">
            <p><sup>c</sup>c2s2: Cardiff Cohort Selection System.</p>
          </fn>
          <fn id="table7fn4">
            <p><sup>d</sup>P: precision.</p>
          </fn>
          <fn id="table7fn5">
            <p><sup>e</sup>R: recall.</p>
          </fn>
          <fn id="table7fn6">
            <p><sup>f</sup>F: <italic>F</italic> measure.</p>
          </fn>
          <fn id="table7fn7">
            <p><sup>g</sup>HNN: hierarchical neural network.</p>
          </fn>
          <fn id="table7fn8">
            <p><sup>h</sup>The overall values provided in the bottom row have been microaveraged across the 13 classifiers.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <p>The best results shown in italics in <xref ref-type="table" rid="table7">Table 7</xref> were calculated on the basis of at most 8 positive examples, which account for less than 10% of the test data. This makes it impossible to differentiate between random and statistically significant outcomes, thus making it difficult to generalize the findings. The most extreme example is that of KETO-1YR, which had no positive examples in the test data. The results of all 4 systems were identical with no classification errors. Again, given that the training data contained only 1 positive example, the best classification strategy would be the majority rule, which would achieve the same result. Similarly, ALCOHOL-ABUSE, DRUG-ABUSE, and MAKES-DECISIONS had only 3 positive examples in the test data. On these classes, the 4 systems achieved average precision, recall, and <italic>F</italic> measure of 58.43%, 65.46%, and 59.38% with standard deviations of 40.11%, 36.51%, and 37.48%, respectively, again illustrating the difficulty of generalizing these findings. Finally, MI-6MOS had 8 positive examples. The rule-based system achieved the best performance followed by HNN. At 40.00%, the remaining 2 systems achieved a modest <italic>F</italic> measure on the <italic>met</italic> class, but they did differ in the way they balanced precision and recall. Overall, no obvious pattern could be noticed in the classification performance on this class.</p>
      <p>All 4 systems achieved similar performance for HBA<sub>1c</sub> and ASP-FOR-MI. On the <italic>met</italic> class, all 4 systems achieved maximal precision on HBA<sub>1c</sub> with recall in the 80s, resulting in an <italic>F</italic> measure just below or just above 90%. Conversely, on the <italic>met</italic> class, all 4 systems achieved almost perfect recall on ASP-FOR-MI with precision in the high 80s, resulting in an <italic>F</italic> measure over 90%. Given the consistently high performance, we infer that the 2 eligibility criteria are semantically tractable in the sense that they lend themselves to being modeled computationally.</p>
      <p>The rule-based approach performed best against the following eligibility criteria: ABDOMINAL and CREATININE. For ABDOMINAL, recall was in the 80s on the <italic>met</italic> class with no significant variation across the systems. However, the 2 machine learning approaches demonstrated markedly lower precision than the rule-based approach: 60s versus 90s. Further experiments are needed to determine whether more training data would help reduce the number of false positives. In reality, the cost and time associated with data annotation impose an upper bound on the amount of training data available. Given the <italic>F</italic> measure is in the high 80s, rule-based approaches could be a preferred option for narrowly defined eligibility criteria, which can be mapped to explicit references in text. We can observe similar results for CREATININE. The rule-based approach performed best with an <italic>F</italic> measure in the 80s on the <italic>met</italic> class, followed by our own approach with comparable performance. Although we used machine learning, the key feature used by the classifier was in fact extracted using a rule-based approach. This is consistent with our previous recommendation.</p>
      <p>Conversely, broader eligibility criteria, which require some reasoning over multiple references made across the discourse, may require a machine learning approach to model the complexities of target classification problems. MAJOR-DIABETES is one such example where major complications may not be restricted to a finite class of signs and symptoms. In addition, such complications may be mentioned without an explicit reference to diabetes. This requires complex analysis of the wider context. Neural networks can be used to model nonlinearity in text. Not surprisingly, the HNN approach achieved the best results in this case. In particular, the robustness of this approach is reflected in achieving a recall of over 90% on the <italic>met</italic> class. The rule-based approaches demonstrated lower recall. Our own approach demonstrated the lowest recall as we also used a rule-based approach to extract pertinent features. However, our use of machine learning on top of such features resulted in the second highest precision on the <italic>met</italic> class.</p>
      <p>Another example of this type of problem is ADVANCED-CAD. As expected, both machine learning approaches performed better than the other 2, with overall <italic>F</italic> measure in the 80s and 70s, respectively. In particular, our approach significantly outperformed all others in both precision and recall (see <xref ref-type="table" rid="table8">Table 8</xref>). We attribute such a performance to a suitable combination of rule-based feature extraction and supervised classification. By examining <xref ref-type="table" rid="table5">Table 5</xref>, we can see that the majority of features are related to advanced cardiovascular disease either directly (eg, HRTMED, HRTTRT, HRTISC, HRTANG, HRTCAD, and ASPFMI) or indirectly (eg, BRPMED and DMCMPL). Our approach demonstrates the degree to which domain knowledge infusion can improve the performance of machine learning when trained on a relatively small dataset. However, it does not require comprehensive knowledge elicitation. We simply used online resources and simple corpus analysis to inform the development of the corresponding lexica and regular expressions following the same approach used successfully in previous shared tasks [<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref53">53</xref>].</p>
      <table-wrap position="float" id="table8">
        <label>Table 8</label>
        <caption>
          <p>Detailed holdout test results for ADVANCED-CAD.</p>
        </caption>
        <table border="1" rules="groups" cellpadding="5" frame="hsides" width="1000" cellspacing="0">
          <thead>
            <tr valign="top">
              <td rowspan="2">System</td>
              <td colspan="3">Met</td>
              <td colspan="3">Not met</td>
              <td>Overall</td>
            </tr>
            <tr valign="top">
              <td>P<sup>a</sup> (%)</td>
              <td>R<sup>b</sup> (%)</td>
              <td>F<sup>c</sup> (%)</td>
              <td>P (%)</td>
              <td>R (%)</td>
              <td>F (%)</td>
              <td>F (%)</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>c2s2<sup>d</sup></td>
              <td>83.02</td>
              <td>97.78</td>
              <td>89.80</td>
              <td>96.97</td>
              <td>78.05</td>
              <td>86.49</td>
              <td>88.14</td>
            </tr>
            <tr valign="top">
              <td>Hybrid</td>
              <td>74.55</td>
              <td>91.11</td>
              <td>82.00</td>
              <td>87.10</td>
              <td>65.85</td>
              <td>75.00</td>
              <td>78.50</td>
            </tr>
            <tr valign="top">
              <td>Rules</td>
              <td>67.80</td>
              <td>88.89</td>
              <td>76.92</td>
              <td>81.48</td>
              <td>53.66</td>
              <td>64.71</td>
              <td>70.81</td>
            </tr>
            <tr valign="top">
              <td>HNN<sup>e</sup></td>
              <td>77.36</td>
              <td>91.11</td>
              <td>83.67</td>
              <td>87.88</td>
              <td>70.73</td>
              <td>78.38</td>
              <td>81.03</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table8fn1">
            <p><sup>a</sup>P: precision.</p>
          </fn>
          <fn id="table8fn2">
            <p><sup>b</sup>R: recall.</p>
          </fn>
          <fn id="table8fn3">
            <p><sup>c</sup>F: <italic>F</italic> measure.</p>
          </fn>
          <fn id="table8fn4">
            <p><sup>d</sup>c2s2: Cardiff Cohort Selection System.</p>
          </fn>
          <fn id="table8fn5">
            <p><sup>e</sup>HNN: hierarchical neural network.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <p>Ideally, supervised learning performs best when large training datasets with a reasonable class balance are available to extrapolate a classification model while minimizing overfitting. As we can see from the data (see <xref rid="figure3" ref-type="fig">Figure 3</xref>), this was not the case in this particular study. This is likely to be the norm in practice rather than the exception. When structured data are available to support certain eligibility criteria, there is no need for analyzing the unstructured text data. When such a need does exist, the use of supervised learning requires manual annotation of text data, which requires clinical expertise. The cost and time associated with this activity naturally impose an upper bound on the amount of training data available. This limited amount of training data will immediately exclude approaches such as deep learning, which, in theory, could be used to extract complex relationships between words using long- and short-term memory. Therefore, the remaining choices include rule-based classification and supervised learning. Clinical trials are plagued by insufficient recruitment rates. On average, 86% of trials fail to recruit a sufficient number of patients, 85% of trials overrun because of insufficient recruitment, 37% of sites do not meet their recruitment targets, and 20% fail to recruit any patients [<xref ref-type="bibr" rid="ref54">54</xref>]. Even when sufficient numbers are initially recruited, the problem of a 30% dropout rate remains. Not surprisingly, 30% of phase III trial terminations are because of recruitment failures. Owing to these recruitment concerns, one would naturally opt for supervised learning approaches as they are more robust than rule-based approaches in terms of recall. In other words, they would help identify a much larger pool of patients to potentially recruit. 
However, the limited amount of training data will prevent the use of longer n-grams as it would lead to document representation vectors that are long and sparse, a combination prone to overfitting. This leaves the BoW approach as the most plausible option. To compensate for the loss of context, manual feature engineering can be used to model complex relationships between words. This represents a practical compromise between rule-based and machine learning approaches. This study provides a practical example of such a hybrid approach. The development of our system required less than 2 person-months, while achieving performance that could boost recruitment. The system is expected to reduce clinicians’ workload in line with the estimates reported by other studies [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>].</p>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">BoW</term>
          <def>
            <p>bag-of-words</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">c2s2</term>
          <def>
            <p>Cardiff Cohort Selection System</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">cTAKES</term>
          <def>
            <p>clinical Text Analysis and Knowledge Extraction System</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">EEG</term>
          <def>
            <p>electroencephalography</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">EMR</term>
          <def>
            <p>electronic medical record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">GTB</term>
          <def>
            <p>gradient tree boosting</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">HBA<sub>1c</sub></term>
          <def>
            <p>glycated hemoglobin</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">HNN</term>
          <def>
            <p>hierarchical neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">ICD-9</term>
          <def>
            <p>International Classification of Diseases, Ninth Revision</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">IE</term>
          <def>
            <p>information extraction</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">IR</term>
          <def>
            <p>information retrieval</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">LR</term>
          <def>
            <p>logistic regression</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">MedLEE</term>
          <def>
            <p>Medical Language Extraction and Encoding</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">n2c2</term>
          <def>
            <p>National natural language processing Clinical Challenge</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">NB</term>
          <def>
            <p>naïve Bayesian</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb18">UMLS</term>
          <def>
            <p>Unified Medical Language System</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors gratefully thank Nikola Cihoric, MD, for sharing his medical expertise, which partly informed the development of the preprocessing module.</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>IS designed the system. IS and PC implemented the following modules: preprocessing, normalization, filtering, and feature extraction. DK and AB implemented the classification module. All authors were involved in the evaluation and interpretation of the results. IS drafted the manuscript. All authors reviewed and approved the manuscript for publication.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Bull</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>McKee</surname>
              <given-names>KJ</given-names>
            </name>
            <name name-style="western">
              <surname>Mahon</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Harper</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>J</given-names>
            </name>
            <collab>CTTI Recruitment Project Team</collab>
          </person-group>
          <article-title>Clinical trials recruitment planning: a proposed framework from the Clinical Trials Transformation Initiative</article-title>
          <source>Contemp Clin Trials</source>
          <year>2018</year>
          <month>03</month>
          <volume>66</volume>
          <fpage>74</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1551-7144(17)30753-X"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.cct.2018.01.003</pub-id>
          <pub-id pub-id-type="medline">29330082</pub-id>
          <pub-id pub-id-type="pii">S1551-7144(17)30753-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Carlisle</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kimmelman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ramsay</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>MacKinnon</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Unsuccessful trial accrual and human subjects protections: an empirical analysis of recently closed trials</article-title>
          <source>Clin Trials</source>
          <year>2015</year>
          <month>02</month>
          <volume>12</volume>
          <issue>1</issue>
          <fpage>77</fpage>
          <lpage>83</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/25475878"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/1740774514558307</pub-id>
          <pub-id pub-id-type="medline">25475878</pub-id>
          <pub-id pub-id-type="pii">1740774514558307</pub-id>
          <pub-id pub-id-type="pmcid">PMC4516407</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Treweek</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lockhart</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Pitkethly</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cook</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kjeldstrøm</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Johansen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Taskila</surname>
              <given-names>TK</given-names>
            </name>
            <name name-style="western">
              <surname>Sullivan</surname>
              <given-names>FM</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jackson</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Mitchell</surname>
              <given-names>ED</given-names>
            </name>
          </person-group>
          <article-title>Methods to improve recruitment to randomised controlled trials: Cochrane systematic review and meta-analysis</article-title>
          <source>BMJ Open</source>
          <year>2013</year>
          <volume>3</volume>
          <issue>2</issue>
          <fpage>e002360</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://bmjopen.bmj.com/cgi/pmidlookup?view=long&#38;pmid=23396504"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmjopen-2012-002360</pub-id>
          <pub-id pub-id-type="medline">23396504</pub-id>
          <pub-id pub-id-type="pii">bmjopen-2012-002360</pub-id>
          <pub-id pub-id-type="pmcid">PMC3586125</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Unger</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Vaidya</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Hershman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Minasian</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Fleury</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Systematic review and meta-analysis of the magnitude of structural, clinical, and physician and patient barriers to cancer clinical trial participation</article-title>
          <source>J Natl Cancer Inst</source>
          <year>2019</year>
          <month>03</month>
          <day>1</day>
          <volume>111</volume>
          <issue>3</issue>
          <fpage>245</fpage>
          <lpage>55</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/30856272"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jnci/djy221</pub-id>
          <pub-id pub-id-type="medline">30856272</pub-id>
          <pub-id pub-id-type="pii">5307078</pub-id>
          <pub-id pub-id-type="pmcid">PMC6410951</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mahon</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Furlong</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Uhlenbrauck</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Bull</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Barriers to trial recruitment and possible solutions</article-title>
          <source>Applied Clinical Trials</source>
          <year>2016</year>
          <volume>25</volume>
          <issue>2/3</issue>
          <fpage>20</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.appliedclinicaltrialsonline.com/barriers-clinical-trial-recruitment-and-possible-solutions-stakeholder-survey"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Parasuraman</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sheridan</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wickens</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>A model for types and levels of human interaction with automation</article-title>
          <source>IEEE Trans Syst Man Cybern A</source>
          <year>2000</year>
          <volume>30</volume>
          <issue>3</issue>
          <fpage>286</fpage>
          <lpage>97</lpage>
          <pub-id pub-id-type="doi">10.1109/3468.844354</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Clough</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sanderson</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the performance of information retrieval systems using test collections</article-title>
          <source>Inform Res</source>
          <year>2013</year>
          <volume>18</volume>
          <issue>2</issue>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://informationr.net/ir/18-2/paper582.html#.XabgRkYzaM8"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hersh</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Bedrick</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Test collections for electronic health record-based clinical information retrieval</article-title>
          <source>JAMIA Open</source>
          <year>2019</year>
          <fpage>ooz016</fpage>
          <pub-id pub-id-type="doi">10.1093/jamiaopen/ooz016</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bozorgi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lhatoo</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Sahoo</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>EpiDEA: extracting structured epilepsy and seizure information from patient discharge summaries for cohort identification</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2012</year>
          <volume>2012</volume>
          <fpage>1191</fpage>
          <lpage>200</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/23304396"/>
          </comment>
          <pub-id pub-id-type="medline">23304396</pub-id>
          <pub-id pub-id-type="pmcid">PMC3540531</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shivade</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Raghavan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Fosler-Lussier</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Embi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Elhadad</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>AM</given-names>
            </name>
          </person-group>
          <article-title>A review of approaches to identifying patient phenotype cohorts using electronic health records</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2014</year>
          <volume>21</volume>
          <issue>2</issue>
          <fpage>221</fpage>
          <lpage>30</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24201027"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2013-001935</pub-id>
          <pub-id pub-id-type="medline">24201027</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2013-001935</pub-id>
          <pub-id pub-id-type="pmcid">PMC3932460</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ni</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kennebeck</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dexheimer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>McAneney</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lingren</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Zhai</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Solti</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Automated clinical trial eligibility prescreening: increasing the efficiency of patient identification for clinical trials in the emergency department</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2015</year>
          <month>01</month>
          <volume>22</volume>
          <issue>1</issue>
          <fpage>166</fpage>
          <lpage>78</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/25030032"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2014-002887</pub-id>
          <pub-id pub-id-type="medline">25030032</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2014-002887</pub-id>
          <pub-id pub-id-type="pmcid">PMC4433376</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Castro</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Dligach</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Finan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Can</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Abd-El-Barr</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gainer</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Shadick</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Weiss</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Large-scale identification of patients with cerebral aneurysms using natural language processing</article-title>
          <source>Neurology</source>
          <year>2017</year>
          <month>01</month>
          <day>10</day>
          <volume>88</volume>
          <issue>2</issue>
          <fpage>164</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27927935"/>
          </comment>
          <pub-id pub-id-type="doi">10.1212/WNL.0000000000003490</pub-id>
          <pub-id pub-id-type="medline">27927935</pub-id>
          <pub-id pub-id-type="pii">WNL.0000000000003490</pub-id>
          <pub-id pub-id-type="pmcid">PMC5224711</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Aligned-layer text search in clinical notes</article-title>
          <source>Stud Health Technol Inform</source>
          <year>2017</year>
          <volume>245</volume>
          <fpage>629</fpage>
          <lpage>33</lpage>
          <pub-id pub-id-type="doi">10.3233/978-1-61499-830-3-629</pub-id>
          <pub-id pub-id-type="medline">29295172</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shivade</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hebert</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lopetegui</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>de Marneffe</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>Fosler-Lussier</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Textual inference for eligibility criteria resolution in clinical trials</article-title>
          <source>J Biomed Inform</source>
          <year>2015</year>
          <month>12</month>
          <volume>58</volume>
          <supplement>Suppl</supplement>
          <fpage>S211</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(15)00201-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2015.09.008</pub-id>
          <pub-id pub-id-type="medline">26376462</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(15)00201-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC4978353</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kreuzthaler</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schulz</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Berghold</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Secondary use of electronic health records for building cohort studies through top-down information extraction</article-title>
          <source>J Biomed Inform</source>
          <year>2015</year>
          <month>02</month>
          <volume>53</volume>
          <fpage>188</fpage>
          <lpage>95</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(14)00232-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2014.10.010</pub-id>
          <pub-id pub-id-type="medline">25451102</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(14)00232-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kowatch</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Splaingard</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Interactive cohort identification of sleep disorder patients using natural language processing and i2b2</article-title>
          <source>Appl Clin Inform</source>
          <year>2015</year>
          <volume>6</volume>
          <issue>2</issue>
          <fpage>345</fpage>
          <lpage>63</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26171080"/>
          </comment>
          <pub-id pub-id-type="doi">10.4338/ACI-2014-11-RA-0106</pub-id>
          <pub-id pub-id-type="medline">26171080</pub-id>
          <pub-id pub-id-type="pmcid">PMC4493335</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jonnalagadda</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Adupa</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Garg</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Corona-Cox</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Text mining of the electronic health record: an information extraction approach for automated identification and subphenotyping of HFpEF patients for clinical trials</article-title>
          <source>J Cardiovasc Transl Res</source>
          <year>2017</year>
          <month>06</month>
          <volume>10</volume>
          <issue>3</issue>
          <fpage>313</fpage>
          <lpage>21</lpage>
          <pub-id pub-id-type="doi">10.1007/s12265-017-9752-2</pub-id>
          <pub-id pub-id-type="medline">28585184</pub-id>
          <pub-id pub-id-type="pii">10.1007/s12265-017-9752-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vydiswaran</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Strayhorn</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Robinson</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bagazinski</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Essiet</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Iott</surname>
              <given-names>BE</given-names>
            </name>
            <name name-style="western">
              <surname>Joo</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ko</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>JX</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Murali</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sasagawa</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Hybrid bag of approaches to characterize selection criteria for cohort identification</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2019</year>
          <month>11</month>
          <day>1</day>
          <volume>26</volume>
          <issue>11</issue>
          <fpage>1172</fpage>
          <lpage>80</lpage>
          <pub-id pub-id-type="doi">10.1093/jamia/ocz079</pub-id>
          <pub-id pub-id-type="medline">31197354</pub-id>
          <pub-id pub-id-type="pii">5518584</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sada</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Richardson</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>El-Serag</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Davila</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Validation of case finding algorithms for hepatocellular cancer from administrative data and electronic health records using natural language processing</article-title>
          <source>Med Care</source>
          <year>2016</year>
          <month>02</month>
          <volume>54</volume>
          <issue>2</issue>
          <fpage>e9</fpage>
          <lpage>14</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/23929403"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/MLR.0b013e3182a30373</pub-id>
          <pub-id pub-id-type="medline">23929403</pub-id>
          <pub-id pub-id-type="pmcid">PMC3875602</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Birman-Deych</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Waterman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Nilasena</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Radford</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gage</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Accuracy of ICD-9-CM codes for identifying cardiovascular and stroke risk factors</article-title>
          <source>Med Care</source>
          <year>2005</year>
          <month>05</month>
          <volume>43</volume>
          <issue>5</issue>
          <fpage>480</fpage>
          <lpage>5</lpage>
          <pub-id pub-id-type="doi">10.1097/01.mlr.0000160417.39497.a9</pub-id>
          <pub-id pub-id-type="medline">15838413</pub-id>
          <pub-id pub-id-type="pii">00005650-200505000-00009</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gundlapalli</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>South</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Phansalkar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kinney</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Delisle</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Perl</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Samore</surname>
              <given-names>MH</given-names>
            </name>
          </person-group>
          <article-title>Application of natural language processing to VA electronic health records to identify phenotypic characteristics for clinical and research purposes</article-title>
          <source>Summit Transl Bioinform</source>
          <year>2008</year>
          <month>03</month>
          <day>1</day>
          <volume>2008</volume>
          <fpage>36</fpage>
          <lpage>40</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/21347124"/>
          </comment>
          <pub-id pub-id-type="medline">21347124</pub-id>
          <pub-id pub-id-type="pmcid">PMC3041527</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Rashid</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Koblick</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>AT</given-names>
            </name>
            <name name-style="western">
              <surname>Levy</surname>
              <given-names>GD</given-names>
            </name>
            <name name-style="western">
              <surname>Cheetham</surname>
              <given-names>TC</given-names>
            </name>
          </person-group>
          <article-title>Using natural language processing and machine learning to identify gout flares from electronic clinical notes</article-title>
          <source>Arthritis Care Res (Hoboken)</source>
          <year>2014</year>
          <month>11</month>
          <volume>66</volume>
          <issue>11</issue>
          <fpage>1740</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1002/acr.22324"/>
          </comment>
          <pub-id pub-id-type="doi">10.1002/acr.22324</pub-id>
          <pub-id pub-id-type="medline">24664671</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Chase</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Friedman</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Comparing ICD9-encoded diagnoses and NLP-processed discharge summaries for clinical trials pre-screening: a case study</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2008</year>
          <month>11</month>
          <day>6</day>
          <volume>2008</volume>
          <fpage>404</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/18999285"/>
          </comment>
          <pub-id pub-id-type="medline">18999285</pub-id>
          <pub-id pub-id-type="pmcid">PMC2656007</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Friedlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Overhage</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Al-Haddad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Waters</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Aguilar-Saavedra</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kesterson</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidt</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Comparing methods for identifying pancreatic cancer patients using electronic data sources</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2010</year>
          <month>11</month>
          <day>13</day>
          <volume>2010</volume>
          <fpage>237</fpage>
          <lpage>41</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/21346976"/>
          </comment>
          <pub-id pub-id-type="medline">21346976</pub-id>
          <pub-id pub-id-type="pmcid">PMC3041435</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Peterson</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Mani</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Levy</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Denny</surname>
              <given-names>JC</given-names>
            </name>
          </person-group>
          <article-title>Extracting and integrating data from entire electronic health records for detecting colorectal cancer cases</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2011</year>
          <volume>2011</volume>
          <fpage>1564</fpage>
          <lpage>72</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/22195222"/>
          </comment>
          <pub-id pub-id-type="medline">22195222</pub-id>
          <pub-id pub-id-type="pmcid">PMC3243156</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Danforth</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Early</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ngan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kosco</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Gould</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Automated identification of patients with pulmonary nodules in an integrated health system using administrative health plan data, radiology reports, and natural language processing</article-title>
          <source>J Thorac Oncol</source>
          <year>2012</year>
          <month>08</month>
          <volume>7</volume>
          <issue>8</issue>
          <fpage>1257</fpage>
          <lpage>62</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1556-0864(15)32691-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/JTO.0b013e31825bd9f5</pub-id>
          <pub-id pub-id-type="medline">22627647</pub-id>
          <pub-id pub-id-type="pii">S1556-0864(15)32691-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC3443078</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bielinski</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pathak</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Carrell</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Takahashi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Olson</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Larson</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Sohn</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wells</surname>
              <given-names>QS</given-names>
            </name>
            <name name-style="western">
              <surname>Denny</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Rasmussen-Torvik</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Pacheco</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Jackson</surname>
              <given-names>KL</given-names>
            </name>
            <name name-style="western">
              <surname>Lesnick</surname>
              <given-names>TG</given-names>
            </name>
            <name name-style="western">
              <surname>Gullerud</surname>
              <given-names>RE</given-names>
            </name>
            <name name-style="western">
              <surname>Decker</surname>
              <given-names>PA</given-names>
            </name>
            <name name-style="western">
              <surname>Pereira</surname>
              <given-names>NL</given-names>
            </name>
            <name name-style="western">
              <surname>Ryu</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Dart</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Peissig</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Linneman</surname>
              <given-names>JG</given-names>
            </name>
            <name name-style="western">
              <surname>Jarvik</surname>
              <given-names>GP</given-names>
            </name>
            <name name-style="western">
              <surname>Larson</surname>
              <given-names>EB</given-names>
            </name>
            <name name-style="western">
              <surname>Bock</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Tromp</surname>
              <given-names>GC</given-names>
            </name>
            <name name-style="western">
              <surname>de Andrade</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Roger</surname>
              <given-names>VL</given-names>
            </name>
          </person-group>
          <article-title>A robust e-Epidemiology tool in phenotyping heart failure with differentiation for preserved and reduced ejection fraction: the Electronic Medical Records and Genomics (eMERGE) Network</article-title>
          <source>J Cardiovasc Transl Res</source>
          <year>2015</year>
          <month>11</month>
          <volume>8</volume>
          <issue>8</issue>
          <fpage>475</fpage>
          <lpage>83</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26195183"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s12265-015-9644-2</pub-id>
          <pub-id pub-id-type="medline">26195183</pub-id>
          <pub-id pub-id-type="pii">10.1007/s12265-015-9644-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC4651838</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Corey</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kartoun</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Shaw</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Development and validation of an algorithm to identify nonalcoholic fatty liver disease in the electronic medical record</article-title>
          <source>Dig Dis Sci</source>
          <year>2016</year>
          <month>03</month>
          <volume>61</volume>
          <issue>3</issue>
          <fpage>913</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26537487"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s10620-015-3952-x</pub-id>
          <pub-id pub-id-type="medline">26537487</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10620-015-3952-x</pub-id>
          <pub-id pub-id-type="pmcid">PMC4761309</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goodwin</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Harabagiu</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Multi-modal patient cohort identification from EEG report and signal data</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2016</year>
          <volume>2016</volume>
          <fpage>1794</fpage>
          <lpage>803</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/28269938"/>
          </comment>
          <pub-id pub-id-type="medline">28269938</pub-id>
          <pub-id pub-id-type="pmcid">PMC5333290</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Friedman</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Alderson</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Austin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cimino</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A general natural-language text processor for clinical radiology</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>1994</year>
          <volume>1</volume>
          <issue>2</issue>
          <fpage>161</fpage>
          <lpage>74</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/7719797"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/jamia.1994.95236146</pub-id>
          <pub-id pub-id-type="medline">7719797</pub-id>
          <pub-id pub-id-type="pmcid">PMC116194</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Masanz</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ogren</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sohn</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kipper-Schuler</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Chute</surname>
              <given-names>CG</given-names>
            </name>
          </person-group>
          <article-title>Mayo clinical Text Analysis and Knowledge Extraction System (cTAKES): architecture, component evaluation and applications</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2010</year>
          <volume>17</volume>
          <issue>5</issue>
          <fpage>507</fpage>
          <lpage>13</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/20819853"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/jamia.2009.001560</pub-id>
          <pub-id pub-id-type="medline">20819853</pub-id>
          <pub-id pub-id-type="pii">17/5/507</pub-id>
          <pub-id pub-id-type="pmcid">PMC2995668</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pathak</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bailey</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Beebe</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bethard</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Carrell</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Dligach</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Endle</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Hart</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Haug</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>Huff</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Kaggal</surname>
              <given-names>VC</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Marchant</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Masanz</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Oniki</surname>
              <given-names>TA</given-names>
            </name>
            <name name-style="western">
              <surname>Palmer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Peterson</surname>
              <given-names>KJ</given-names>
            </name>
            <name name-style="western">
              <surname>Rea</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>GK</given-names>
            </name>
            <name name-style="western">
              <surname>Stancl</surname>
              <given-names>CR</given-names>
            </name>
            <name name-style="western">
              <surname>Sohn</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Solbrig</surname>
              <given-names>HR</given-names>
            </name>
            <name name-style="western">
              <surname>Suesse</surname>
              <given-names>DB</given-names>
            </name>
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>DP</given-names>
            </name>
            <name name-style="western">
              <surname>Westberg</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhuo</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chute</surname>
              <given-names>CG</given-names>
            </name>
          </person-group>
          <article-title>Normalization and standardization of electronic health records for high-throughput phenotyping: the SHARPn consortium</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2013</year>
          <month>12</month>
          <volume>20</volume>
          <issue>e2</issue>
          <fpage>e341</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24190931"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2013-001939</pub-id>
          <pub-id pub-id-type="medline">24190931</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2013-001939</pub-id>
          <pub-id pub-id-type="pmcid">PMC3861933</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sohn</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ravikumar</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Wagholikar</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Jonnalagadda</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Juhn</surname>
              <given-names>YJ</given-names>
            </name>
          </person-group>
          <article-title>Automated chart review for asthma cohort identification using natural language processing: an exploratory study</article-title>
          <source>Ann Allergy Asthma Immunol</source>
          <year>2013</year>
          <month>11</month>
          <volume>111</volume>
          <issue>5</issue>
          <fpage>364</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24125142"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.anai.2013.07.022</pub-id>
          <pub-id pub-id-type="medline">24125142</pub-id>
          <pub-id pub-id-type="pii">S1081-1206(13)00521-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC3839107</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Denny</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rosenbloom</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Giuse</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>A comparative study of current Clinical Natural Language Processing systems on handling abbreviations in discharge summaries</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2012</year>
          <volume>2012</volume>
          <fpage>997</fpage>
          <lpage>1003</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/23304375"/>
          </comment>
          <pub-id pub-id-type="medline">23304375</pub-id>
          <pub-id pub-id-type="pmcid">PMC3540461</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chopard</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Spasic</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Martín-Vide</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Purver</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pollak</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A deep learning approach to self-expansion of abbreviations based on morphology and context distance</article-title>
          <source>Statistical Language and Speech Processing</source>
          <year>2019</year>
          <publisher-loc>Cham</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>71</fpage>
          <lpage>82</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ji</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Lou</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Clinical trial cohort selection based on multi-level rule-based natural language processing system</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2019</year>
          <month>11</month>
          <day>1</day>
          <volume>26</volume>
          <issue>11</issue>
          <fpage>1218</fpage>
          <lpage>26</lpage>
          <pub-id pub-id-type="doi">10.1093/jamia/ocz109</pub-id>
          <pub-id pub-id-type="medline">31300825</pub-id>
          <pub-id pub-id-type="pii">5531899</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pakhomov</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Buntrock</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chute</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Prospective recruitment of patients with congestive heart failure using an ad-hoc binary classifier</article-title>
          <source>J Biomed Inform</source>
          <year>2005</year>
          <month>04</month>
          <volume>38</volume>
          <issue>2</issue>
          <fpage>145</fpage>
          <lpage>53</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(04)00167-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2004.11.016</pub-id>
          <pub-id pub-id-type="medline">15797003</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(04)00167-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xiong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Cohort selection for clinical trials using hierarchical neural network</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2019</year>
          <month>11</month>
          <day>1</day>
          <volume>26</volume>
          <issue>11</issue>
          <fpage>1203</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.1093/jamia/ocz099</pub-id>
          <pub-id pub-id-type="medline">31305921</pub-id>
          <pub-id pub-id-type="pii">5532320</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wolpert</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>The lack of a priori distinctions between learning algorithms</article-title>
          <source>Neural Comput</source>
          <year>1996</year>
          <volume>8</volume>
          <issue>7</issue>
          <fpage>1341</fpage>
          <lpage>90</lpage>
          <pub-id pub-id-type="doi">10.1162/neco.1996.8.7.1341</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Maguen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Madden</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Patterson</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>DuVall</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Goldstein</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Burkman</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Shiner</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Measuring use of evidence based psychotherapy for posttraumatic stress disorder in a large national healthcare system</article-title>
          <source>Adm Policy Ment Health</source>
          <year>2018</year>
          <month>07</month>
          <volume>45</volume>
          <issue>4</issue>
          <fpage>519</fpage>
          <lpage>29</lpage>
          <pub-id pub-id-type="doi">10.1007/s10488-018-0850-5</pub-id>
          <pub-id pub-id-type="medline">29450781</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10488-018-0850-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kotfila</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Uzuner</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>A systematic comparison of feature space effects on disease classifier performance for phenotype identification of five diseases</article-title>
          <source>J Biomed Inform</source>
          <year>2015</year>
          <month>12</month>
          <volume>58</volume>
          <supplement>Suppl</supplement>
          <fpage>S92</fpage>
          <lpage>102</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(15)00156-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2015.07.016</pub-id>
          <pub-id pub-id-type="medline">26241355</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(15)00156-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC4994187</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ni</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wright</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Perentesis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lingren</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Deleger</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kaiser</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kohane</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Solti</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Increasing the efficiency of trial-patient matching: automated clinical trial eligibility pre-screening for pediatric oncology patients</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2015</year>
          <month>04</month>
          <day>14</day>
          <volume>15</volume>
          <fpage>28</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-015-0149-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-015-0149-3</pub-id>
          <pub-id pub-id-type="medline">25881112</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-015-0149-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC4407835</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ni</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Bermudez</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kennebeck</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liddy-Hicks</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dexheimer</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A real-time automated patient screening system for clinical trials eligibility in an emergency department: design and evaluation</article-title>
          <source>JMIR Med Inform</source>
          <year>2019</year>
          <month>07</month>
          <day>24</day>
          <volume>7</volume>
          <issue>3</issue>
          <fpage>e14185</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2019/3/e14185/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/14185</pub-id>
          <pub-id pub-id-type="medline">31342909</pub-id>
          <pub-id pub-id-type="pii">v7i3e14185</pub-id>
          <pub-id pub-id-type="pmcid">PMC6685132</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="web">
          <source>GitHub</source>
          <year>2019</year>
          <access-date>2019-10-15</access-date>
          <comment>c2s2 <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/dokato/c2s2">https://github.com/dokato/c2s2</ext-link></comment>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Stubbs</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shaw</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Uzuner</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Creation of a new longitudinal corpus of clinical narratives</article-title>
          <source>J Biomed Inform</source>
          <year>2015</year>
          <month>12</month>
          <volume>58</volume>
          <supplement>Suppl</supplement>
          <fpage>S6</fpage>
          <lpage>10</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(15)00212-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2015.09.018</pub-id>
          <pub-id pub-id-type="medline">26433122</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(15)00212-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC4978168</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Stubbs</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kotfila</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Uzuner</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Identifying risk factors for heart disease over time: overview of 2014 i2b2/UTHealth shared task Track 2</article-title>
          <source>J Biomed Inform</source>
          <year>2015</year>
          <month>12</month>
          <volume>58</volume>
          <supplement>Suppl</supplement>
          <fpage>S67</fpage>
          <lpage>77</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(15)00140-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2015.07.001</pub-id>
          <pub-id pub-id-type="medline">26210362</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(15)00140-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC4978189</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Griffis</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Shivade</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Fosler-Lussier</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>A quantitative and qualitative evaluation of sentence boundary detection for the clinical domain</article-title>
          <source>AMIA Jt Summits Transl Sci Proc</source>
          <year>2016</year>
          <volume>2016</volume>
          <fpage>88</fpage>
          <lpage>97</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27570656"/>
          </comment>
          <pub-id pub-id-type="medline">27570656</pub-id>
          <pub-id pub-id-type="pmcid">PMC5001746</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="web">
          <source>TextWorld</source>
          <year>2019</year>
          <access-date>2019-10-15</access-date>
          <comment>Simple Concordance Program <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.textworld.com/scp/">http://www.textworld.com/scp/</ext-link></comment>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Bridewell</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Hanbury</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Cooper</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Buchanan</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>A simple algorithm for identifying negated findings and diseases in discharge summaries</article-title>
          <source>J Biomed Inform</source>
          <year>2001</year>
          <month>10</month>
          <volume>34</volume>
          <issue>5</issue>
          <fpage>301</fpage>
          <lpage>10</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(01)91029-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1006/jbin.2001.1029</pub-id>
          <pub-id pub-id-type="medline">12123149</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(01)91029-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="web">
          <source>scikit-learn</source>
          <year>2019</year>
          <access-date>2019-10-15</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://scikit-learn.org/">https://scikit-learn.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="web">
          <source>XGBoost Documentation</source>
          <year>2019</year>
          <access-date>2019-10-15</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://xgboost.readthedocs.io/">https://xgboost.readthedocs.io/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Spasic</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Sarafraz</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Keane</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Nenadic</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Medication information extraction with linguistic pattern matching and semantic rules</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2010</year>
          <volume>17</volume>
          <issue>5</issue>
          <fpage>532</fpage>
          <lpage>5</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/20819858"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/jamia.2010.003657</pub-id>
          <pub-id pub-id-type="medline">20819858</pub-id>
          <pub-id pub-id-type="pii">17/5/532</pub-id>
          <pub-id pub-id-type="pmcid">PMC2995671</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Spasic</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Burnap</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Greenwood</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Arribas-Ayllon</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>A naïve Bayes approach to classifying topics in suicide notes</article-title>
          <source>Biomed Inform Insights</source>
          <year>2012</year>
          <volume>5</volume>
          <issue>Suppl 1</issue>
          <fpage>87</fpage>
          <lpage>97</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/22879764"/>
          </comment>
          <pub-id pub-id-type="doi">10.4137/BII.S8945</pub-id>
          <pub-id pub-id-type="medline">22879764</pub-id>
          <pub-id pub-id-type="pii">bii-suppl-1-2012-087</pub-id>
          <pub-id pub-id-type="pmcid">PMC3409485</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nuttall</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <source>Vert Asset Management</source>
          <year>2012</year>
          <access-date>2019-10-16</access-date>
          <comment>Considerations For Improving Patient Recruitment Into Clinical Trials <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://vertassets.blob.core.windows.net/download/64c39d7e/64c39d7e-c643-457b-aec2-9ff7b65b3ad2/rdprecruitmentwhitepaper.pdf">http://vertassets.blob.core.windows.net/download/64c39d7e/64c39d7e-c643-457b-aec2-9ff7b65b3ad2/rdprecruitmentwhitepaper.pdf</ext-link></comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
