<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v7i4e14782</article-id>
      <article-id pub-id-type="pmid">31845899</article-id>
      <article-id pub-id-type="doi">10.2196/14782</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Efficient Reuse of Natural Language Processing Models for Phenotype-Mention Identification in Free-text Electronic Medical Records: A Phenotype Embedding Approach</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Vydiswaran</surname>
            <given-names>Vinod</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Polepalli Ramesh</surname>
            <given-names>Balaji</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Wu</surname>
            <given-names>Honghan</given-names>
          </name>
          <degrees>BEng, DPhil</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Centre for Medical Informatics</institution>
            <institution>Usher Institute</institution>
            <institution>University of Edinburgh</institution>
            <addr-line>9 Little France Road</addr-line>
            <addr-line>Edinburgh, EH16 4UX</addr-line>
            <country>United Kingdom</country>
            <phone>44 01316517882</phone>
            <email>honghan.wu@ed.ac.uk</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0213-5668</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Hodgson</surname>
            <given-names>Karen</given-names>
          </name>
          <degrees>DPhil</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9433-7071</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Dyson</surname>
            <given-names>Sue</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9649-5591</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Morley</surname>
            <given-names>Katherine I</given-names>
          </name>
          <degrees>DPhil</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2725-5535</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Ibrahim</surname>
            <given-names>Zina M</given-names>
          </name>
          <degrees>DPhil</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <xref rid="aff7" ref-type="aff">7</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6203-2727</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Iqbal</surname>
            <given-names>Ehtesham</given-names>
          </name>
          <degrees>BEng</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9477-9745</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Stewart</surname>
            <given-names>Robert</given-names>
          </name>
          <degrees>MD, DPhil</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4435-6397</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Dobson</surname>
            <given-names>Richard JB</given-names>
          </name>
          <degrees>DPhil</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <xref rid="aff7" ref-type="aff">7</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4224-9245</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author">
          <name name-style="western">
            <surname>Sudlow</surname>
            <given-names>Cathie</given-names>
          </name>
          <degrees>MD, DPhil</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7725-7520</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Centre for Medical Informatics</institution>
        <institution>Usher Institute</institution>
        <institution>University of Edinburgh</institution>
        <addr-line>Edinburgh</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>School of Computer and Software</institution>
        <institution>Nanjing University of Information Science and Technology</institution>
        <addr-line>Nanjing</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Health Data Research UK</institution>
        <institution>University of Edinburgh</institution>
        <addr-line>Edinburgh</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Psychosis Studies</institution>
        <institution>Institute of Psychiatry, Psychology &#38; Neuroscience</institution>
        <institution>King’s College London</institution>
        <addr-line>London</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>South London and Maudsley NHS Foundation Trust</institution>
        <addr-line>London</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>Centre for Epidemiology and Biostatistics</institution>
        <institution>Melbourne School of Global and Population Health</institution>
        <institution>The University of Melbourne</institution>
        <addr-line>Melbourne</addr-line>
        <country>Australia</country>
      </aff>
      <aff id="aff7">
        <label>7</label>
        <institution>Health Data Research UK</institution>
        <institution>University College London</institution>
        <addr-line>London</addr-line>
        <country>United Kingdom</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Honghan Wu <email>honghan.wu@ed.ac.uk</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <season>Oct-Dec</season>
        <year>2019</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>17</day>
        <month>12</month>
        <year>2019</year>
      </pub-date>
      <volume>7</volume>
      <issue>4</issue>
      <elocation-id>e14782</elocation-id>
      <history>
        <date date-type="received">
          <day>22</day>
          <month>5</month>
          <year>2019</year>
        </date>
        <date date-type="rev-request">
          <day>3</day>
          <month>10</month>
          <year>2019</year>
        </date>
        <date date-type="rev-recd">
          <day>8</day>
          <month>10</month>
          <year>2019</year>
        </date>
        <date date-type="accepted">
          <day>22</day>
          <month>10</month>
          <year>2019</year>
        </date>
      </history>
      <copyright-statement>©Honghan Wu, Karen Hodgson, Sue Dyson, Katherine I Morley, Zina M Ibrahim, Ehtesham Iqbal, Robert Stewart, Richard JB Dobson, Cathie Sudlow. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 17.12.2019.</copyright-statement>
      <copyright-year>2019</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://medinform.jmir.org/2019/4/e14782/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Much effort has been put into the use of automated approaches, such as natural language processing (NLP), to mine or extract data from free-text medical records in order to construct comprehensive patient profiles for delivering better health care. Reusing NLP models in new settings, however, remains cumbersome, as it requires validation and retraining on new data iteratively to achieve convergent results.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>The aim of this work is to minimize the effort involved in reusing NLP models on free-text medical records.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We formally define and analyze the model adaptation problem in phenotype-mention identification tasks. We identify “duplicate waste” and “imbalance waste,” which collectively impede efficient model reuse. We propose a phenotype embedding–based approach to minimize these sources of waste without the need for labelled data from new settings.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>We conduct experiments on data from a large mental health registry to reuse NLP models in four phenotype-mention identification tasks. The proposed approach can choose the best model for a new task, identifying up to 76% waste (duplicate waste), that is, phenotype mentions without the need for validation and model retraining and with very good performance (93%-97% accuracy). It can also provide guidance for validating and retraining the selected model for novel language patterns in new tasks, saving around 80% waste (imbalance waste), that is, the effort required in “blind” model-adaptation approaches.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Adapting pretrained NLP models for new tasks can be more efficient and effective if the language pattern landscapes of old settings and new settings can be made explicit and comparable. Our experiments show that the phenotype-mention embedding approach is an effective way to model language patterns for phenotype-mention identification tasks and that its use can guide efficient NLP model reuse.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>text mining</kwd>
        <kwd>phenotype</kwd>
        <kwd>word embedding</kwd>
        <kwd>phenotype embedding</kwd>
        <kwd>model adaptation</kwd>
        <kwd>electronic health records</kwd>
        <kwd>machine learning</kwd>
        <kwd>clustering</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Compared to structured components of electronic health records (EHRs), free-text comprises a much deeper and larger volume of health data. For example, in a recent geriatric syndrome study [<xref ref-type="bibr" rid="ref1">1</xref>], unstructured EHR data contributed a significant proportion of identified cases: 67.9% cases of falls, 86.6% cases of visual impairment, and 99.8% cases of lack of social support. Similarly, in a study of comorbidities using a database of anonymized EHRs of a psychiatric hospital in London (the South London and Maudsley NHS Foundation Trust [SLaM]) [<xref ref-type="bibr" rid="ref2">2</xref>], 1899 cases of comorbid depression and type 2 diabetes were identified from unstructured EHRs, while only 19 cases could be found using structured diagnosis tables. The value of unstructured records for selecting cohorts has also been widely reported [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. Extracting clinical variables or identifying phenotypes from unstructured EHR data is, therefore, essential for addressing many clinical questions and research hypotheses [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref7">7</xref>].</p>
      <p>Automated approaches are essential to surface such deep data from free-text clinical notes at scale. To make natural language processing (NLP) tools accessible for clinical applications, various approaches have been proposed, including generic, user-friendly tools [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>] and Web services or cloud-based solutions [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. Among these approaches, perhaps, the most efficient way to facilitate clinical NLP projects is to adapt pretrained NLP models in new but similar settings [<xref ref-type="bibr" rid="ref14">14</xref>], that is, to reuse existing NLP solutions to answer new questions or to work on new data sources. However, it is very often burdensome to reuse pretrained NLP models. This is mainly because NLP models essentially abstract language patterns (ie, language characteristics representable in computable form) and subsequently use them for prediction or classification tasks. These patterns are prone to change when the document set (corpus) or the text mining task (what to look up) changes. Unfortunately, when it comes to a new setting, it is uncertain which patterns have and have not changed. Therefore, in practice, random samples are drawn to validate the performance of an existing NLP model in a new setting and subsequently to plan the adaptation of the model based on the validation results.</p>
      <p>Such “<italic>blind</italic>” adaptation is costly in the clinical domain because of barriers to data access and expensive clinical expertise needed for data labelling. The <italic>“blindness</italic>” to the similarities and differences of language pattern landscapes between the source (where the model was trained) and target (the new task) settings causes (at least) two types of potentially unnecessary, wasted effort, which may be avoidable. First, for data in the target setting with the same patterns as in the source setting, any validation or retraining efforts are unnecessary because the model has already been trained and validated on these language patterns. We call this type of wasted effort the “<italic>duplicate waste</italic>.” The second type of <italic>waste</italic> occurs if the distribution of new language patterns in the target setting is unbalanced, that is, some—but not all—data instances belong to different language patterns. The model adaptation involves validating the model on these new data and further adjusting it when performance is not good enough. Without the knowledge of which data instances belong to which language patterns, data instances have to be randomly sampled for validation and adaptation. In most cases, a minimal number of instances of every pattern need to be processed, so that convergent results can be obtained. This will usually be achieved via iterative validation and adaptation process, which will inevitably cause commonly used language patterns to be over represented, resulting in the model being over validated/retrained on such data. Such unnecessary efforts on commonly used language patterns result from the pattern imbalance in the target setting, which unfortunately is the norm in almost all real-world EHR datasets. We call this “<italic>imbalance waste.</italic>”</p>
      <p>The ability to make language patterns <italic>visible</italic> and comparable will address whether an NLP model can be adapted to a new task and, importantly, provide guidance on how to solve new problems effectively and efficiently through the <italic>smart</italic> adaptation of existing models. In this paper, we introduce a contextualized embedding model to <italic>visualize</italic> such patterns and provide guidance for reusing NLP models in phenotype-mention identification tasks. Here, a phenotype mention denotes an appearance of a word or phrase (representing a medical concept) in a document, which indicates a phenotype related to a person. We note two aspects of this definition:</p>
      <list list-type="order">
        <list-item>
          <p>Phenotype mention ≠ Medical concept mention. When a medical concept mentioned in a document does not indicate a phenotype relating to a person (eg, cases in the last two rows of Table 1), it is not a phenotype mention.</p>
        </list-item>
        <list-item>
          <p>Phenotype mention ≠ Phenotype. Phenotype (eg, diseases and associated traits) is a specific patient characteristic [<xref ref-type="bibr" rid="ref15">15</xref>] and a patient-level feature, (eg, a binary value indicating whether a patient is a smoker). However, for the same phenotype, a patient might have multiple phenotype mentions. For example, “xxx is a smoker” could be mentioned in different documents or even multiple times in one document, and each of these appearances is a phenotype mention.
    </p>
        </list-item>
      </list>
      <p/>
      <p>The focus of this work is to minimize the effort in reusing existing NLP model(s) in solving new tasks rather than proposing a novel NLP model for phenotype-mention identification. We aim to address the problem of NLP model transferability in the task of extracting mentions of phenotypes from free-text medical records. Specifically, the task is to identify the above-defined phenotype mentions and the contexts in which they were mentioned [<xref ref-type="bibr" rid="ref10">10</xref>]. <xref ref-type="table" rid="table1">Table 1</xref> explains and provides examples of contextualized phenotype mentions. The research question to be investigated is formally defined as mentioned in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref> and illustrated in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>The task of recognizing contextualized phenotype mentions is to identify mentions of phenotypes from free-text records and classify the context of each mention into five categories (listed in the second column of Table 1). The last two rows give examples of nonphenotype mentions—the two sentences are not describing incidents of a condition.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="627"/>
          <col width="373"/>
          <thead>
            <tr valign="top">
              <td>Examples</td>
              <td>Types of phenotype mentions</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>49 year old man with <italic>hepatitis c</italic></td>
              <td>Positive mention<sup>a</sup></td>
            </tr>
            <tr valign="top">
              <td>With no evidence of <italic>cancer recurrence</italic></td>
              <td>Negated mention<sup>a</sup></td>
            </tr>
            <tr valign="top">
              <td>…Is concerning for local <italic>lung cancer recurrence</italic></td>
              <td>Hypothetical mention<sup>a</sup></td>
            </tr>
            <tr valign="top">
              <td>PAST MEDICAL HISTORY: (1) <italic>Atrial Fibrillation</italic>, (2)...</td>
              <td>History mention<sup>a</sup></td>
            </tr>
            <tr valign="top">
              <td>Mother was A positive, <italic>hepatitis C carrier</italic>, and...</td>
              <td>Mention of phenotype in another person<sup>a</sup></td>
            </tr>
            <tr valign="top">
              <td>She visited the <italic>HIV</italic> clinic last week.</td>
              <td>Not a phenotype mention</td>
            </tr>
            <tr valign="top">
              <td>The patient asked for information about <italic>stroke</italic>.</td>
              <td>Not a phenotype mention</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table1fn1">
            <p><sup>a</sup>Contextualized mentions.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <boxed-text id="box1" position="float">
        <title>Research question.</title>
        <p>Definition 1. Given a natural language processing model (denoted as <italic>m</italic>) previously trained for some phenotype-mention identification task(s), and a new task (denoted as <italic>T</italic>, where either phenotypes to be identified are new or the dataset is new, or both are new), <italic>m</italic> is used in <italic>T</italic> to identify a set of phenotype mentions—denoted as <italic>S</italic>. The research question is how to partition <italic>S</italic> to meet the following criteria:</p>
        <list list-type="order">
          <list-item>
            <p>A maximum p-known subset <italic>S<sub>known</sub></italic> where <italic>m</italic>’s performance can be properly predicted using prior knowledge of <italic>m</italic>;</p>
          </list-item>
          <list-item>
            <p>p-unknown subsets: {<italic>S<sub>u1</sub></italic>, <italic>S<sub>u2</sub></italic>…<italic>S<sub>uk</sub></italic>}, which meet the following criteria:</p>
          </list-item>
        </list>
        <p>
          <graphic xlink:href="medinform_v7i4e14782_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
      </boxed-text>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>Assess the transferability of a pretrained model in solving a new task: Discriminate between differently inaccurate mentions identified by the model in the new setting.</p>
        </caption>
        <graphic xlink:href="medinform_v7i4e14782_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <p>The identification of “<italic>p-known</italic>” subset (criterion 1) will help eliminate <italic>“duplicate waste”</italic> by avoiding unnecessary validation and adaptation on those phenotype mentions. On the other hand, separating the rest of the annotations into “<italic>p-unknown</italic>” subsets allows processing mentions based on their <italic>performance-relevant</italic> characteristics separately, which in turn helps avoid “<italic>imbalance waste.</italic>” The abovementioned criterion 2a ensures completeness of coverage of all performance-unknown mentions and criterion 2b ensures no overlaps between mention subsets, so that no duplicated effort will be put on the same mentions. Criterion 2c requires that the partitioning of the mentions is <italic>performance-relevant</italic>, meaning that model performance on a small number of samples can be generalized to the whole subset that they are drawn from. Lastly, a small <italic>k</italic> (criterion 2d) enables efficient adaptation of a model.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Dataset and Adaptable Phenotype-Mention Identification Models</title>
        <p>Recently, we developed SemEHR [<xref ref-type="bibr" rid="ref10">10</xref>]—a semantic search toolkit aiming to use interactive information retrieval functionalities to replace NLP building, so that clinical researchers can use a browser-based interface to access text mining results from a generic NLP model and (optionally) keep getting better results by iteratively feeding them back to the system. A SLaM instance of this system has been trained for supporting six comorbidity studies (62,719 patients and 17,479,669 clinical notes in total), where different combinations of physical conditions and mental disorders are extracted and analyzed. <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> provides details about the user interface and model performance. These studies effectively generated 23 phenotype-mention identification models and relevant labelled data (&#62;7000 annotated documents), which we use to study model transferability.</p>
      </sec>
      <sec>
        <title>Foundation of the Proposed Approach</title>
        <p>Our approach is based on the following assumption about a language pattern representation model:</p>
        <list list-type="bullet">
          <list-item>
            <p><italic>Assumption 1.</italic> There exists a pattern representation model, A, for identifying language patterns of phenotype mentions with the following characteristics:</p>
            <list list-type="order">
              <list-item>
                <p>Each phenotype mention can be characterized by only one language pattern.</p>
              </list-item>
              <list-item>
                <p>Patterns are largely shared by different mentions.</p>
              </list-item>
              <list-item>
                <p>There is a deterministic association between NLP models’ performances with such language patterns.</p>
              </list-item>
            </list>
          </list-item>
        </list>
        <list list-type="bullet">
          <list-item>
            <p><italic>Theorem 1.</italic> Given <italic>A</italic>, a pattern model meeting Assumption 1, <italic>m</italic>—an NLP model, and <italic>T</italic>—a new task, let <italic>P<sub>m</sub></italic> be the pattern set <italic>A</italic> identifies from the dataset(s) that <italic>m</italic> was trained or validated on; let <italic>P<sub>T</sub></italic> be the pattern set <italic>A</italic> identifies from <italic>S</italic>—the set of all mentions identified by <italic>m</italic> in <italic>T</italic>. Then, the problem defined in Definition 1 can be solved by a solution where <italic>P<sub>m</sub></italic> ∩ <italic>P<sub>T</sub></italic> is the “p-known” subset and <italic>P<sub>T</sub></italic> – <italic>P<sub>m</sub></italic> ∩ <italic>P<sub>T</sub></italic> is the “p-unknown” subsets.</p>
          </list-item>
        </list>
        <p>Proof of Theorem 1 can be found in the <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. The rest of this section provides details of a realization of <italic>A</italic> using distributed representation models.</p>
      </sec>
      <sec>
        <title>Distributed Representation for Contextualized Phenotype Mentions</title>
        <p>In computational linguistics, statistical language models are, perhaps, the most common approach to quantify word sequences, where a distribution is used to represent the probability of a sequence of words: <italic>P</italic>(<italic>w<sub>1</sub></italic>…<italic>w<sub>n</sub></italic>). Among such models, the bag-of-words (BOW) model [<xref ref-type="bibr" rid="ref15">15</xref>] is perhaps the earliest and simplest, yet widely used and efficient in certain tasks [<xref ref-type="bibr" rid="ref16">16</xref>]. To overcome BOW’s limitations (eg, ignoring semantic similarities between words), more complex models were introduced to represent word semantics [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>]. Probably, the most popular alternative is the distributed representation model [<xref ref-type="bibr" rid="ref20">20</xref>], which uses a vector space to model words, so that word similarities can be represented as distances between their vectors. This concept has since been extensively followed up, extended, and shown to significantly improve NLP tasks [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref26">26</xref>].</p>
        <p>In original distributed representation models, the semantics of one word is encoded in one single vector, which makes it impossible to disambiguate different semantics or contexts that one word might be used for in a corpus. Recently, various (bidirectional) long short-term memory models were proposed to learn contextualized word vectors [<xref ref-type="bibr" rid="ref27">27</xref>-<xref ref-type="bibr" rid="ref29">29</xref>]. However, such linguistic contexts are not the phenotype contexts (<xref ref-type="table" rid="table1">Table 1</xref>) that we seek in this paper.</p>
        <p>Inspired by the good properties of distributed representations for words, we propose a phenotype encoding approach that aims to model the language patterns of contextualized phenotype mentions. Compared to word semantics, phenotype semantics are represented in a larger context, at the sentence or even paragraph level (eg, <italic>he worries about contracting HIV</italic>; here, HIV is a hypothetical phenotype mention). The key idea of our approach is to use explicit mark-ups to represent phenotype semantics in the text, so that they can be learned through an approach similar to the word embedding learning framework.</p>
        <p><xref rid="figure2" ref-type="fig">Figure 2</xref> illustrates our framework for extending the continuous BOW word embedding architecture to capture the semantics of contextualized phenotype mentions. Explicit <italic>mark-ups of phenotype mentions</italic> are added to the architecture as placeholders for phenotype semantics. A mark-up (eg, C0038454_POS) is composed of two parts: phenotype identification (eg, C0038454) and contextual description (eg, POS). The first part identifies a phenotype using a standardized vocabulary. In our implementation, the Unified Medical Language System (UMLS) [<xref ref-type="bibr" rid="ref30">30</xref>] was chosen for its broad concept coverage and the provision of comprehensive synonyms for concepts. The first benefit of using a standardized phenotype definition is that it helps in grouping together mentions of the same phenotype using different names. For example, using UMLS concept identification of C0038454 for STROKE helps combine mentions using <italic>Stroke, Cerebrovascular Accident, Brain Attack</italic>, and 43 other synonyms. The second benefit is from the concept relations represented in the vocabulary hierarchy, which helps the transferability computation that we will elaborate on later (step 3 in the next subsection). The second part of a phenotype mention mark-up is to identify the mention context. Six types of contexts are supported: POS for <italic>positive mention</italic>, NEG for <italic>negated mention</italic>, HYP for <italic>hypothetical mention</italic>, HIS for <italic>history mention</italic>, OTH for <italic>mention of the phenotype in another person</italic>, and NOT for <italic>not a phenotype mention</italic>.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>The framework to learn contextualized phenotype embedding from labelled data that a natural language processing model m was trained or validated on. TIA: transient ischemic attack.</p>
          </caption>
          <graphic xlink:href="medinform_v7i4e14782_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>The <italic>phenotype mention mark-ups</italic> can be populated using labelled data that NLP models were trained or validated on. In our implementation, the mark-ups were generated from the labelled subset of SLaM EHRs.</p>
      </sec>
      <sec>
        <title>Using Phenotype Embedding and Their Semantics for Assessing Model Transferability</title>
        <p>The embeddings learned (including both word and contextualized phenotype vectors) are the building blocks underlying the language pattern representation model—<italic>A</italic>, as introduced at the beginning of this section, which is to compute <italic>P<sub>m</sub></italic> (the landscape of language patterns that <italic>m</italic> is familiar with) and <italic>P<sub>T</sub></italic> (the landscape of language patterns in the new task <italic>T</italic>) for assessing and guiding NLP model adaptation for new tasks.</p>
        <p><xref rid="figure3" ref-type="fig">Figure 3</xref> illustrates the architecture of our approach. The double-circle shape denotes the embeddings learned from <italic>m</italic>’s labelled data. Essentially, the process is composed of two phases: (1) the documents from a new task (on the left of the figure) are annotated with phenotype mentions using a pretrained model <italic>m</italic> and (2) a classification task uses the abovementioned embeddings to assess each mention—whether it is an instance of <italic>p-known</italic> (something similar enough to what <italic>m</italic> is familiar with) or any subset of <italic>p-unknown</italic> (something that is new to <italic>m</italic>). Specifically, the process is composed of the following steps:</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Architecture of phenotype embedding-based approach for transferring pretrained natural language processing models for identifying new phenotypes or application to new corpora. The word and phenotype embedding model is learned from the training data of the reusable models in its source domain (the task that m was trained for). No labelled data in the target domain (new setting) are required for the adaptation guidance. NLP: natural language processing.</p>
          </caption>
          <graphic xlink:href="medinform_v7i4e14782_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>1) Vectorize phenotype mentions in a new task: Each mention in the new task will be represented as a vector of real numbers using the learned embedding model to combine its surrounding words as context semantics. Formally, the vector representation is defined as shown in <xref ref-type="boxed-text" rid="box2">Textbox 2</xref>.</p>
        <boxed-text id="box2" position="float">
          <title>Vector representation of a phenotype mention.</title>
          <p>Let <italic>s</italic> be a mention identified by <italic>m</italic> in the new task, where <italic>s</italic>can be represented by a function defined as follows:</p>
          <p><graphic xlink:href="medinform_v7i4e14782_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>(1)</p>
          <p>Where</p>
          <p>
            <graphic xlink:href="medinform_v7i4e14782_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </p>
          <p>is the embedding model to convert a word token into a vector, <italic>t<sub>j</sub></italic> is the j<sup>th</sup> word in a document, <italic>i</italic> is the offset of the first word of <italic>s</italic> in the document, <italic>l</italic> is the number of words in <italic>s</italic>, and <italic>f</italic> is a function to combine a set of vectors into a result vector (we use <italic>average</italic> in our implementation).</p>
        </boxed-text>
        <p>With such representations, all mentions are effectively put in a vector space (depicted as a 2D space on the right of the figure for illustration purposes).</p>
        <p>2) Identify clusters (language patterns) of mention vectors: In the vector space, clusters are naturally formed based on geometric distances between mention vectors. After trying different clustering algorithms and parameters, DBScan [<xref ref-type="bibr" rid="ref31">31</xref>] was chosen on Euclidean distance in our implementation for vector clustering. Essentially, each cluster is a set of mentions considered to share the same (or similar enough) underlying language pattern, meaning that language patterns in the new task are technically the vector clusters. We chose the cluster centroid (arithmetic mean) to represent a cluster (ie, its underlying language pattern).</p>
        <p>3) Choose a reference vector for classifying language patterns: After clusters (language patterns) are identified, the next step is to classify them as p-known or subsets of p-unknown. We choose a reference vector–based approach, classifying patterns using the distance to a selected vector. Such a reference vector is picked up (when the phenotype to be identified has been trained in m) or generated (when the phenotype is new to m) from the learned phenotype embeddings the model m has seen previously. Apparently, when the phenotype to be identified in the new task is new to m (not in the set of phenotypes it was developed for), the reference phenotype needs to be carefully selected, so that it can help produce a sensible separation between p-known and p-unknown clusters. We use the semantic similarity (distance between two concepts in the UMLS tree structure) to choose the most similar phenotype from the phenotype list m was trained for. Formally, the reference is chosen as shown in <xref ref-type="boxed-text" rid="box3">Textbox 3</xref>.</p>
        <boxed-text id="box3" position="float">
          <title>Reference phenotype selection</title>
          <p>Let <italic>c<sub>p</sub></italic> be the Unified Medical Language System concept for a phenotype to be identified in the new task and <italic>C<sub>m</sub></italic> be the set of phenotype concepts that <italic>m</italic> was trained for; the reference phenotype choosing function is <graphic xlink:href="medinform_v7i4e14782_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> (2)</p>
          <p>Where <italic>D</italic> is a distance function to calculate the steps between two nodes in the Unified Medical Language System concept tree.</p>
        </boxed-text>
        <p>Once the reference phenotype has been chosen, the reference vector can be selected or generated (eg, use the average) from this phenotype’s contextual embeddings.</p>
        <p>4) Classify language patterns to guide model adaptation: Once the reference vector has been selected, clusters can be classified based on the distances between their centroids (representative vectors of clusters) and the reference vector. Once a distance threshold is chosen, this distance-based classification partitions the vector space into two subspaces using the reference vector as the center: the subspace whose distance to the center is less than the threshold is called p-known subspace and the remainder is the p-unknown subspace. The union of clusters whose centroids are within the p-known subspace is p-known, meaning m’s performances on them can be predicted without further validation (removing duplicate waste). Other clusters are p-unknown clusters, and m can be validated or further trained on each p-unknown cluster separately instead of blindly across all clusters. This will remove imbalance waste.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Associations Between Embedding-Based Language Patterns and Model Performances</title>
        <p>As stated at the beginning of the Method section, our approach is based on three assumptions about language patterns. Therefore, it is essential to quantify to what extent the language patterns identified by our embedding-based approach meet these assumptions. The first assumption—a phenotype mention can be assigned to one and only one language pattern—is met in our approach, since (1) the vectorization function (Equation 1) is a one-to-one function and (2) the DBScan algorithm (the vector clustering function chosen in our implementation) is also a one-to-one function. Assumption 2 can be quantified by the percentage of mentions that can be assigned to a cluster. This percentage can be increased by increasing the epsilon (EPS) parameter (the maximum distance between two data items for them to be considered in the same neighborhood) in DBScan. However, the degree to which mentions are clustered together needs to be balanced against the consequence of the reduced ability to identify performance-related language patterns, which is the third assumption: associations between language patterns and model performance. To quantify such an association, we propose a metric called bad guy separate power (SP), as defined in Equation 3 below (<xref ref-type="boxed-text" rid="box4">Textbox 4</xref>). The aim is to measure to what extent a clustering can assemble incorrect data items (false-positive mentions of phenotypes) together.</p>
        <boxed-text id="box4" position="float">
          <title>Bad Guy Separate Power.</title>
          <p>Let C be a set of binary data items – <graphic xlink:href="medinform_v7i4e14782_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> (<italic>t</italic> stands for true; <italic>f</italic> stands for false), given a clustering result {<italic>C<sub>1</sub></italic>…<italic>C<sub>k</sub></italic>|<italic>C<sub>1</sub></italic>∪<italic>C<sub>2</sub></italic>…∪<italic>C<sub>k</sub></italic>=<italic>C</italic>}, its separate power for <italic>f</italic> typed data items is defined as follows:</p>
          <p><graphic xlink:href="medinform_v7i4e14782_fig11.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> (3)</p>
        </boxed-text>
        <p>In our scenario, we would like to see clustering being able to separate easy cases (where good performance is achieved) from difficult cases (where performance is poor) for a model <italic>m</italic>.</p>
        <p>To quantify the clustering percentage, the ability to separate mentions based on model performances, and the interplay between the two, we conducted experiments on selected phenotypes by continuously increasing the clustering parameter EPS from a low level. <xref rid="figure4" ref-type="fig">Figure 4</xref> shows the results. In this experiment, we label mentions into two types—correct and incorrect—using SemEHR labelled data on the SLaM corpus. Specifically, for the mention types in <xref ref-type="table" rid="table1">Table 1</xref>, incorrect mentions are those denoted “not-a-phenotype-mention” and the remainder are labelled as correct. We chose incorrect as the <italic>f</italic> in equation 3, as we evaluate the separate power on incorrect mentions. Four phenotypes were selected for this evaluation: <italic>Diabetes</italic> and <italic>Hypertensive disease</italic> were selected because they were the most validated phenotypes and <italic>Abscess</italic> (with 13% incorrect mentions) and <italic>Blindness</italic> (with 47% incorrect mentions) were chosen to represent NLP models with different levels of performance. The figure shows a clear trend in all cases: As EPS increases, the clustered percentage increases, but with decreasing separate power. This confirms a trade-off between the coverage of identified language patterns and how good they are. Regarding separate power, the performance on two selected common phenotypes (<xref rid="figure4" ref-type="fig">Figure 4</xref>a and <xref rid="figure4" ref-type="fig">4</xref>b) is generally worse than that for the other phenotypes, starting with lower power, which decreases faster as the EPS increases. The main reason is that the difficult cases (mentions with poor performance) in the two commonly encountered phenotypes are relatively rare (diabetes: 8.5%; hypertensive disease: 5.5%). In such situations, difficult cases are harder to separate because their patterns are underrepresented. However, in general, compared to random clustering, the embedding-based clustering approach brings in much better separate power in all cases. This confirms a high-level association between identified clusters and model performance. 
In particular, when the proportion of difficult cases reaches near 50% (<xref rid="figure4" ref-type="fig">Figure 4</xref>d), the approach can keep <italic>SP</italic> values almost constantly near 1.0 when the EPS increases. This means it can almost always group difficult cases in their own clusters.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Clustered percentage versus separate power on difficult cases. The x-axis is the Epsilon (EPS) parameter of the DBScan clustering algorithm---the longest distance between any two items within a cluster; the y-axis is the percentage. Two types of changing information (as functions of EPS) are plotted on each panel: clustered percentage (solid line) and SP on incorrect cases (false-positive mentions of phenotypes). The latter has two series: (1) SP by chance (dash dotted line) when clustering by randomly selecting mentions and (2) SP by clustering using phenotype embedding (dashed line). N: number of all mentions; N_f: number of false-positive mentions; SP: separate power.</p>
          </caption>
          <graphic xlink:href="medinform_v7i4e14782_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Model Adaptation Guidance Evaluation</title>
        <p>Technically, the guidance to model adaptation is composed of two parts: avoid <italic>duplicate waste</italic> (skip validation/training efforts on cases the model is already familiar with) and avoid <italic>imbalance waste</italic> (group new language patterns together, so that validation/continuous training on each group separately can be more efficient than doing it over the whole corpus). To quantify the guidance effectiveness, the following metrics are introduced.</p>
        <list list-type="bullet">
          <list-item>
            <p>Duplicate waste: This is the number of mentions whose patterns fall into what the model m is familiar with. The quantity <graphic xlink:href="medinform_v7i4e14782_fig12.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> is the proportion of mentions that need no validation or retraining before reusing <italic>m</italic>.</p>
          </list-item>
          <list-item>
            <p>Imbalance waste: To achieve convergence performance, an NLP model needs to be trained on a minimal number (denoted as e) of samples from each language pattern. Calling the language pattern set in a new task as C={C1…Ck}, the following equation counts the minimum number of samples needed to achieve convergent results in “blind” adaptations:</p>
          </list-item>
        </list>
        <p><graphic xlink:href="medinform_v7i4e14782_fig13.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> (4)</p>
        <p>When the language patterns are identifiable, the <italic>Imbalance waste</italic> that can be avoided is quantified as <graphic xlink:href="medinform_v7i4e14782_fig14.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/></p>
        <list list-type="bullet">
          <list-item>
            <p>Accuracy: To evaluate whether our approach can really identify familiar patterns, we quantify the accuracy of those within-threshold clusters and those within-threshold single mentions that are not clustered. Both macro-accuracy (average of all cluster accuracies) and micro-accuracy (overall accuracy) are used (detailed explanations provided elsewhere [<xref ref-type="bibr" rid="ref32">32</xref>]).</p>
          </list-item>
        </list>
        <p><xref rid="figure5" ref-type="fig">Figure 5</xref> shows the results of our NLP model adaptation guidance on four phenotype-identification tasks. For each new phenotype-identification task, the NLP model (pre)trained for the semantically most similar (defined in Equation 2) phenotype was chosen as the reuse model. Models and labelled data for the four pairs of phenotypes were selected from six physical comorbidity studies on SLaM data. <xref rid="figure5" ref-type="fig">Figure 5</xref> shows that identified mentions have a high proportion of avoidable duplicate waste in all four cases: Diabetes and heart attack start with 50%, whereas stroke and multiple sclerosis are &#62;70%. Such avoidable duplicate waste decreases when the threshold increases. The threshold is on similarity instead of distance, meaning that new patterns need to be more similar to the reuse model’s embeddings to be counted as familiar patterns. Therefore, it is understandable that duplicate waste decreases in such scenarios. In terms of accuracy, one would expect this to increase, as only more similar patterns are left when the threshold increases. However, interestingly, in all cases, both macro- and micro-accuracies decrease slightly before increasing to reach near 1.0. This is a phenomenon worth future investigation. In general, the changes in accuracy are small (0.03-0.08), while accuracy remains high (&#62;0.92). Given these observations, the threshold is normally set at 0.01, to optimize the avoidance of duplicate waste with minimal effect on accuracy. Specifically, in all cases, more than half of the identified mentions (&#62;50% for <xref rid="figure5" ref-type="fig">Figure 5</xref>a and <xref rid="figure5" ref-type="fig">5</xref>b; &#62;70% for <xref rid="figure5" ref-type="fig">Figure 5</xref>c and <xref rid="figure5" ref-type="fig">5</xref>d) do not need any validation/training to obtain an accuracy of &#62;0.95. 
In terms of effective adaptation on new patterns, the percentages of avoidable imbalance waste in all cases are around 80%, confirming that a much more efficient retraining on data can be achieved through language pattern-based guidance.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Identifying new phenotypes by reusing natural language processing models pretrained for semantically close phenotypes: The four pairs of phenotype-mention identification models are chosen from SemEHR models trained on SLaM data; DBScan Epsilon (EPS) value=3.8, and imbalance waste is calculated on e=3, meaning at least 3 samples are needed for training from each language pattern. The x-axis is the similarity threshold, ranging from 0.0 to 0.8; the y-axes, from top to bottom, are the proportion of duplicate waste saved over total number of mentions, macro-accuracy, and micro-accuracy, respectively.</p>
          </caption>
          <graphic xlink:href="medinform_v7i4e14782_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Effectiveness of Phenotype Semantics in Model Reuse</title>
        <p>When considering NLP model reuse for a new task, if there is no existing model that has been developed for the same phenotype-mention identification task, our approach will choose a model trained for a phenotype that is most semantically similar to it (based on Equation 2). To evaluate the effectiveness of such semantic relationships in reusing NLP models, we conducted experiments on the previous four phenotypes by using phenotype models with different levels of semantic similarities. <xref ref-type="table" rid="table2">Table 2</xref> shows the results. In all cases, reusing models trained for more similar phenotypes can identify more <italic>duplicate waste</italic> using the same parameter settings. The first three cases in the table can also achieve better accuracies, while <italic>multiple sclerosis</italic> had slightly better accuracy by reusing the <italic>diabetes</italic> model than the more semantically similar <italic>myasthenia gravis</italic>. However, the latter identified 46% more <italic>duplicate waste</italic>.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Comparisons of the performance of reusing models with different semantic similarity levels. Similarity threshold: 0.01; DBScan EPS: 0.38. Reusing models trained for more (semantically) similar phenotypes achieved adaptation results with less effort (more duplicate waste identified) in all cases, and the results were more accurate in three of four cases. Performance metrics of better reusable models are highlighted as bold numbers.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="300"/>
            <col width="250"/>
            <col width="250"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>Model reuse cases</td>
                <td>Duplicate waste</td>
                <td>Macro-accuracy</td>
                <td>Micro-accuracy</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Diabetes by Type 2 Diabetes<sup>a</sup></td>
                <td>0.502<sup>b</sup></td>
                <td>0.966<sup>b</sup></td>
                <td>0.933<sup>b</sup></td>
              </tr>
              <tr valign="top">
                <td>Diabetes by Hypercholesterolemia</td>
                <td>0.477</td>
                <td>0.965</td>
                <td>0.930</td>
              </tr>
              <tr valign="top">
                <td>Stroke by Heart Attack<sup>a</sup></td>
                <td>0.711<sup>b</sup></td>
                <td>0.948<sup>b</sup></td>
                <td>0.955<sup>b</sup></td>
              </tr>
              <tr valign="top">
                <td>Stroke by Fatigue</td>
                <td>0.220</td>
                <td>0.884</td>
                <td>0.938</td>
              </tr>
              <tr valign="top">
                <td>Heart attack by Infarct<sup>a</sup></td>
                <td>0.569<sup>b</sup></td>
                <td>0.989<sup>b</sup></td>
                <td>0.966<sup>b</sup></td>
              </tr>
              <tr valign="top">
                <td>Heart attack by Bruise</td>
                <td>0.529</td>
                <td>0.821</td>
                <td>0.889</td>
              </tr>
              <tr valign="top">
                <td>Multiple Sclerosis by Myasthenia Gravis<sup>a</sup></td>
                <td>0.761<sup>b</sup></td>
                <td>0.944</td>
                <td>0.971</td>
              </tr>
              <tr valign="top">
                <td>Multiple Sclerosis by Diabetes</td>
                <td>0.522</td>
                <td>0.993<sup>b</sup></td>
                <td>0.979<sup>b</sup></td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>More similar model reuse cases.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>Performance metrics of better reusable models.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Ethical Approval and Informed Consent</title>
        <p>Deidentified patient records were accessed through the Clinical Record Interactive Search at the Maudsley NIHR Biomedical Research Centre, South London, and Maudsley (SLaM) NHS Foundation Trust. This is a widely used clinical database with a robust data governance structure, which has received ethical approval for secondary analysis (Oxford REC 18/SC/0372).</p>
      </sec>
      <sec>
        <title>Data Availability Statement</title>
        <p>The clinical notes are not sharable in the public domain. However, interested researchers can apply for research access through https://www.maudsleybrc.nihr.ac.uk/facilities/clinical-record-interactive-search-cris/. The natural language processing tool, models, and code of this work are available at https://github.com/CogStack/CogStack-SemEHR.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Results</title>
        <p>Automated extraction methods (as surveyed recently by Ford et al [<xref ref-type="bibr" rid="ref33">33</xref>]), many of which are freely available and open source, have been intensively investigated in mining free-text medical records [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref34">34</xref>-<xref ref-type="bibr" rid="ref36">36</xref>]. To provide guidance in the efficient reuse of pretrained NLP models, we have proposed an approach that can automatically (1) identify easy cases in a new task for the reused model, on which it can achieve good performance with high confidence and (2) classify the remainder of the cases, so that the validation or retraining on them can be conducted much more efficiently, compared to adapting the model on all cases. Specifically, in four phenotype-mention identification tasks, we have shown that 50%-79% of all mentions are identifiably easy cases, for which our approach can choose the best model to reuse, achieving more than 93% accuracy. Furthermore, for those cases that need validation or retraining, our approach can provide guidance that can save 78%-85% of the validation/retraining effort. A distinct feature of this approach is that it requires no labelled data from new settings, which enables very efficient model adaptation, as shown in our evaluation: zero effort to obtain &#62;93% accuracy among the majority (&#62;63% on average) of the results.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>In this study, we did not evaluate the recall of adapted NLP models in new tasks. Although the models we chose can generally achieve very good recall for identifying physical conditions (96%-98%) within the SLaM records, investigating the transferability on recalls is an important aspect of NLP model adaptation.</p>
        <p>The model reuse experiments were conducted on identifying new phenotypes on document sets that had not previously been seen by the NLP model. However, these documents were still part of the same (SLaM) EHR system. To fully test the generalizability of our approach will require evaluation of model reuse in a different EHR system, which will require a new set of access approvals as well as information governance approval for the sharing of embedding models between different hospitals.</p>
        <p>We chose a phenotype embedding model to represent language patterns. One reason is that we have a limited number of manually annotated data items. The word embedding approach is unsupervised, and the word-level “semantics” learned from the whole corpus can help group similar words together in the vector space, so that it can help improve the phenotype-level clustering performances. However, thorough comparisons between different language pattern models are needed to reveal whether other approaches, in particular, simpler or less computing-intensive approaches can achieve similar or different performances.</p>
        <p>In addition, implementation-wise, vector clustering is an important aspect of this approach. We have compared DBScan with the k-nearest neighbors algorithm in our model, which revealed that DBScan could achieve better SP in most scenarios. Using a 64-bit Windows 10 server with 16 GB memory and 8 core central processing units (3.6 GHz), DBScan uses 200 MB memory and takes 0.038 seconds on about 300 data points on average over 100 executions. However, in-depth comparisons among more clustering algorithms are worthwhile. In particular, a larger dataset might be needed to compare the clustering performances on both the computational aspect and SP.</p>
      </sec>
      <sec>
        <title>Comparison With Prior Work</title>
        <p>NLP model adaptation aims to adapt NLP models from a source domain (with abundant labelled data) to a target domain (with limited labelled data). This challenge has been extensively studied in the NLP community [<xref ref-type="bibr" rid="ref37">37</xref>-<xref ref-type="bibr" rid="ref41">41</xref>]. However, most existing approaches assume a single language model (eg, a probability distribution) from each domain. This limits the ability to identify and subsequently deal differently with data items with different language patterns. Such a limitation prevents fine-grained adaptations, such as the reuse or adaptation of one NLP model on those items for which it performs well, and the retraining of the same model or reuse of other models on those items for which the original NLP model performs poorly. In contrast, our work aimed to depict the language patterns (ie, different language models) of both source and target domains and subsequently provide actionable guidance on reusing models based on these fine-grained language patterns. Further, very few NLP model reuse studies have focused on free text in electronic medical records. To the best of our knowledge, this work is among the first to focus on model reuse for phenotype-mention identification tasks on real-world free-text electronic medical records.</p>
        <p>Modelling language patterns has been investigated for different applications, such as the k-Signature approach [<xref ref-type="bibr" rid="ref42">42</xref>] for identifying unique “signatures” of micro-message authors. This paper models language patterns for characterizing the “landscape” of phenotype mentions. One main difference is that we do not know how many clusters (or “signatures”) of language patterns exist in our scenario. Technically, we use phenotype embeddings to model such patterns and, particularly, utilize phenotype semantic similarities (based on ontology hierarchies) for reusing learned embeddings, when necessary.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>Making fine-grained language patterns visible and comparable (in computable form) is the key to supporting “smart” NLP model adaptation. We have shown that the phenotype embedding-based approach proposed in this paper is an effective way to achieve this. However, our approach is just one way to model such fine-grained patterns. Investigating novel pattern representation models is an exciting research direction to enable automated NLP model adaptation and composition (ie, combining various models together) for efficiently mining free-text electronic medical records in new settings with maximum efficiency and minimal effort.</p>
        <p/>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>User interface and model performances of phenotype natural language processing models.</p>
        <media xlink:href="medinform_v7i4e14782_app1.docx" xlink:title="DOCX File , 968 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Proof of Theorem 1.</p>
        <media xlink:href="medinform_v7i4e14782_app2.docx" xlink:title="DOCX File , 8 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">BOW</term>
          <def>
            <p>bag of words</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">EPS</term>
          <def>
            <p>epsilon</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">LSTM</term>
          <def>
            <p>long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">SLaM</term>
          <def>
            <p>South London and Maudsley NHS Foundation Trust</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">SP</term>
          <def>
            <p>separate power</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This research was funded by Medical Research Council/Health Data Research UK Grant (MR/S004149/1), Industrial Strategy Challenge Grant (MC_PC_18029), and the National Institute for Health Research (NIHR) Biomedical Research Centre at South London and Maudsley NHS Foundation Trust and King’s College London. The views expressed are those of the authors and not necessarily those of the NHS, the NIHR, or the Department of Health and Social Care.</p>
      <p/>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Anzaldi</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hernandez</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Davison</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Leff</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kimura</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>The Value of Unstructured Electronic Health Record Data in Geriatric Syndrome Case Identification</article-title>
          <source>J Am Geriatr Soc</source>
          <year>2018</year>
          <month>08</month>
          <volume>66</volume>
          <issue>8</issue>
          <fpage>1499</fpage>
          <lpage>1507</lpage>
          <pub-id pub-id-type="doi">10.1111/jgs.15411</pub-id>
          <pub-id pub-id-type="medline">29972595</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Perera</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Broadbent</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Callard</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Downs</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dutta</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Fernandes</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hayes</surname>
              <given-names>RD</given-names>
            </name>
            <name name-style="western">
              <surname>Henderson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Jackson</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Jewell</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kadra</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Little</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Pritchard</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shetty</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Tulloch</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Cohort profile of the South London and Maudsley NHS Foundation Trust Biomedical Research Centre (SLaM BRC) Case Register: current status and recent enhancement of an Electronic Mental Health Record-derived data resource</article-title>
          <source>BMJ Open</source>
          <year>2016</year>
          <month>03</month>
          <day>01</day>
          <volume>6</volume>
          <issue>3</issue>
          <fpage>e008721</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmjopen.bmj.com/content/6/3/e008721"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmjopen-2015-008721</pub-id>
          <pub-id pub-id-type="medline">26932138</pub-id>
          <pub-id pub-id-type="pii">bmjopen-2015-008721</pub-id>
          <pub-id pub-id-type="pmcid">PMC4785292</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Roque</surname>
              <given-names>FS</given-names>
            </name>
            <name name-style="western">
              <surname>Jensen</surname>
              <given-names>PB</given-names>
            </name>
            <name name-style="western">
              <surname>Schmock</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Dalgaard</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Andreatta</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hansen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Søeby</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bredkjær</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Juul</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Werge</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Jensen</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Brunak</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Using Electronic Patient Records to Discover Disease Correlations and Stratify Patient Cohorts</article-title>
          <source>PLoS Comput Biol</source>
          <year>2011</year>
          <month>8</month>
          <day>25</day>
          <volume>7</volume>
          <issue>8</issue>
          <fpage>e1002141</fpage>
          <pub-id pub-id-type="doi">10.1371/journal.pcbi.1002141</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Byrd</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ebadollahi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Daar</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Early detection of heart failure with varying prediction windows by structured and unstructured data in electronic health records</article-title>
          <year>2015</year>
          <conf-name>37th Annual International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC)</conf-name>
          <conf-date>2015</conf-date>
          <conf-loc>Milano, Italy</conf-loc>
          <pub-id pub-id-type="doi">10.1109/embc.2015.7318907</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abhyankar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Demner-Fushman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Callaghan</surname>
              <given-names>FM</given-names>
            </name>
            <name name-style="western">
              <surname>McDonald</surname>
              <given-names>CJ</given-names>
            </name>
          </person-group>
          <article-title>Combining structured and unstructured data to identify a cohort of ICU patients who received dialysis</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2014</year>
          <month>09</month>
          <volume>21</volume>
          <issue>5</issue>
          <fpage>801</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/jamia/article/21/5/801/757962"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2013-001915</pub-id>
          <pub-id pub-id-type="medline">24384230</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2013-001915</pub-id>
          <pub-id pub-id-type="pmcid">PMC4147606</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Margulis</surname>
              <given-names>AV</given-names>
            </name>
            <name name-style="western">
              <surname>Fortuny</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kaye</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Calingaert</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Reynolds</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Plana</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>McQuay</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Atsma</surname>
              <given-names>WJ</given-names>
            </name>
            <name name-style="western">
              <surname>Franks</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>de Vogel</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Perez-Gutthann</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Arana</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Value of Free-text Comments for Validating Cancer Cases Using Primary-care Data in the United Kingdom</article-title>
          <source>Epidemiology</source>
          <year>2018</year>
          <volume>29</volume>
          <issue>5</issue>
          <fpage>e41</fpage>
          <lpage>e42</lpage>
          <pub-id pub-id-type="doi">10.1097/ede.0000000000000856</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bell</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kilic</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Prabakaran</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>YY</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Broadbent</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Curtis</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Use of electronic health records in identifying drug and alcohol misuse among psychiatric in-patients</article-title>
          <source>Psychiatrist</source>
          <year>2018</year>
          <month>01</month>
          <day>02</day>
          <volume>37</volume>
          <issue>1</issue>
          <fpage>15</fpage>
          <lpage>20</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cambridge.org/core/services/aop-cambridge-core/content/view/7C7BEF23485C724728CCDDBD3FBC1E90/S1758320900008246a.pdf/use_of_electronic_health_records_in_identifying_drug_and_alcohol_misuse_among_psychiatric_inpatients.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.1192/pb.bp.111.038240</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jackson MSc</surname>
              <given-names>RG</given-names>
            </name>
            <name name-style="western">
              <surname>Ball</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Hayes</surname>
              <given-names>RD</given-names>
            </name>
            <name name-style="western">
              <surname>Dobson</surname>
              <given-names>RJB</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>TextHunter--A User Friendly Tool for Extracting Generic Concepts from Free Text in Clinical Research</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2014</year>
          <volume>2014</volume>
          <fpage>729</fpage>
          <lpage>38</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/25954379"/>
          </comment>
          <pub-id pub-id-type="medline">25954379</pub-id>
          <pub-id pub-id-type="pmcid">PMC4420012</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>GK</given-names>
            </name>
            <name name-style="western">
              <surname>Masanz</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ogren</surname>
              <given-names>PV</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sohn</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kipper-Schuler</surname>
              <given-names>KC</given-names>
            </name>
            <name name-style="western">
              <surname>Chute</surname>
              <given-names>CG</given-names>
            </name>
          </person-group>
          <article-title>Mayo clinical Text Analysis and Knowledge Extraction System (cTAKES): architecture, component evaluation and applications</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2010</year>
          <month>09</month>
          <day>01</day>
          <volume>17</volume>
          <issue>5</issue>
          <fpage>507</fpage>
          <lpage>513</lpage>
          <pub-id pub-id-type="doi">10.1136/jamia.2009.001560</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Toti</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Morley</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ibrahim</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Folarin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jackson</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kartoglu</surname>
              <given-names>Ismail</given-names>
            </name>
            <name name-style="western">
              <surname>Agrawal</surname>
              <given-names>Asha</given-names>
            </name>
            <name name-style="western">
              <surname>Stringer</surname>
              <given-names>Clive</given-names>
            </name>
            <name name-style="western">
              <surname>Gale</surname>
              <given-names>Darren</given-names>
            </name>
            <name name-style="western">
              <surname>Gorrell</surname>
              <given-names>Genevieve</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>Angus</given-names>
            </name>
            <name name-style="western">
              <surname>Broadbent</surname>
              <given-names>Matthew</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>Robert</given-names>
            </name>
            <name name-style="western">
              <surname>Dobson</surname>
              <given-names>Richard J B</given-names>
            </name>
          </person-group>
          <article-title>SemEHR: A general-purpose semantic search system to surface semantic data from clinical notes for tailored care, trial recruitment, and clinical research</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2018</year>
          <month>05</month>
          <day>01</day>
          <volume>25</volume>
          <issue>5</issue>
          <fpage>530</fpage>
          <lpage>537</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/29361077"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocx160</pub-id>
          <pub-id pub-id-type="medline">29361077</pub-id>
          <pub-id pub-id-type="pii">4817428</pub-id>
          <pub-id pub-id-type="pmcid">PMC6019046</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Christoph</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Griebel</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Leb</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Engel</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Köpcke</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Toddenroth</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Prokosch</surname>
              <given-names>HU</given-names>
            </name>
            <name name-style="western">
              <surname>Laufer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Marquardt</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Sedlmayr</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Secure Secondary Use of Clinical Data with Cloud-based NLP Services</article-title>
          <source>Methods Inf Med</source>
          <year>2018</year>
          <month>01</month>
          <day>22</day>
          <volume>54</volume>
          <issue>03</issue>
          <fpage>276</fpage>
          <lpage>282</lpage>
          <pub-id pub-id-type="doi">10.3414/me13-01-0133</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tablan</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Cunningham</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Bontcheva</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>GATECloud.net: a platform for large-scale, open-source text processing on the cloud</article-title>
          <source>Philosophical Transactions of the Royal Society A: Mathematical, Physical and Engineering Sciences</source>
          <year>2012</year>
          <month>12</month>
          <day>10</day>
          <volume>371</volume>
          <issue>1983</issue>
          <fpage>20120071</fpage>
          <lpage>20120071</lpage>
          <pub-id pub-id-type="doi">10.1098/rsta.2012.0071</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chard</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Russell</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lussier</surname>
              <given-names>YA</given-names>
            </name>
            <name name-style="western">
              <surname>Mendonça</surname>
              <given-names>Eneida A</given-names>
            </name>
            <name name-style="western">
              <surname>Silverstein</surname>
              <given-names>JC</given-names>
            </name>
          </person-group>
          <article-title>A cloud-based approach to medical NLP</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2011</year>
          <volume>2011</volume>
          <fpage>207</fpage>
          <lpage>16</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/22195072"/>
          </comment>
          <pub-id pub-id-type="medline">22195072</pub-id>
          <pub-id pub-id-type="pmcid">PMC3243210</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Carroll</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Thompson</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Eyler</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mandelin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Zink</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Pacheco</surname>
              <given-names>Jennifer A</given-names>
            </name>
            <name name-style="western">
              <surname>Boomershine</surname>
              <given-names>Chad S</given-names>
            </name>
            <name name-style="western">
              <surname>Lasko</surname>
              <given-names>Thomas A</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Hua</given-names>
            </name>
            <name name-style="western">
              <surname>Karlson</surname>
              <given-names>Elizabeth W</given-names>
            </name>
            <name name-style="western">
              <surname>Perez</surname>
              <given-names>Raul G</given-names>
            </name>
            <name name-style="western">
              <surname>Gainer</surname>
              <given-names>Vivian S</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>Shawn N</given-names>
            </name>
            <name name-style="western">
              <surname>Ruderman</surname>
              <given-names>Eric M</given-names>
            </name>
            <name name-style="western">
              <surname>Pope</surname>
              <given-names>Richard M</given-names>
            </name>
            <name name-style="western">
              <surname>Plenge</surname>
              <given-names>Robert M</given-names>
            </name>
            <name name-style="western">
              <surname>Kho</surname>
              <given-names>Abel Ngo</given-names>
            </name>
            <name name-style="western">
              <surname>Liao</surname>
              <given-names>Katherine P</given-names>
            </name>
            <name name-style="western">
              <surname>Denny</surname>
              <given-names>Joshua C</given-names>
            </name>
          </person-group>
          <article-title>Portability of an algorithm to identify rheumatoid arthritis in electronic health records</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2012</year>
          <month>06</month>
          <volume>19</volume>
          <issue>e1</issue>
          <fpage>e162</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/22374935"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2011-000583</pub-id>
          <pub-id pub-id-type="medline">22374935</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2011-000583</pub-id>
          <pub-id pub-id-type="pmcid">PMC3392871</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>ZS</given-names>
            </name>
          </person-group>
          <article-title>Distributional Structure</article-title>
          <source>WORD</source>
          <year>2015</year>
          <month>12</month>
          <day>04</day>
          <volume>10</volume>
          <issue>2-3</issue>
          <fpage>146</fpage>
          <lpage>162</lpage>
          <pub-id pub-id-type="doi">10.1080/00437956.1954.11659520</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Salton</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>CS</given-names>
            </name>
          </person-group>
          <article-title>A vector space model for automatic indexing</article-title>
          <source>Commun ACM</source>
          <year>1975</year>
          <volume>18</volume>
          <issue>11</issue>
          <fpage>613</fpage>
          <lpage>620</lpage>
          <pub-id pub-id-type="doi">10.1145/361219.361220</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>PF</given-names>
            </name>
            <name name-style="western">
              <surname>Desouza</surname>
              <given-names>PV</given-names>
            </name>
            <name name-style="western">
              <surname>Mercer</surname>
              <given-names>RL</given-names>
            </name>
            <name name-style="western">
              <surname>Pietra</surname>
              <given-names>VJD</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>JC</given-names>
            </name>
          </person-group>
          <article-title>Class-based n-gram models of natural language</article-title>
          <source>Computational Linguistics</source>
          <year>1992</year>
          <volume>18</volume>
          <fpage>479</fpage>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Deerwester</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dumais</surname>
              <given-names>ST</given-names>
            </name>
            <name name-style="western">
              <surname>Furnas</surname>
              <given-names>GW</given-names>
            </name>
            <name name-style="western">
              <surname>Landauer</surname>
              <given-names>TK</given-names>
            </name>
            <name name-style="western">
              <surname>Harshman</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Indexing by latent semantic analysis</article-title>
          <source>J Am Soc Inf Sci</source>
          <year>1990</year>
          <month>09</month>
          <volume>41</volume>
          <issue>6</issue>
          <fpage>391</fpage>
          <lpage>407</lpage>
          <pub-id pub-id-type="doi">10.1002/(sici)1097-4571(199009)41:6&#60;391::aid-asi1&#62;3.0.co;2-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Blei</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jordan</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Latent Dirichlet Allocation</article-title>
          <source>Journal of Machine Learning Research</source>
          <year>2003</year>
          <volume>3</volume>
          <fpage>993</fpage>
          <lpage>1022</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hinton</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <source>Carnegie-Mellon University</source>
          <year>1984</year>
          <access-date>2019-11-06</access-date>
          <comment>Distributed representations. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.cs.toronto.edu/~hinton/absps/pdp3.pdf">http://www.cs.toronto.edu/~hinton/absps/pdp3.pdf</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ducharme</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Vincent</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Jauvin</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>A neural probabilistic language model</article-title>
          <source>Journal of machine learning research</source>
          <year>2003</year>
          <volume>3</volume>
          <fpage>1137</fpage>
          <lpage>1155</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Collobert</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Weston</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A unified architecture for natural language processing: Deep neural networks with multitask learning</article-title>
          <year>2008</year>
          <conf-name>Proceedings of the 25th international conference on Machine learning</conf-name>
          <conf-date>2008</conf-date>
          <conf-loc>Helsinki, Finland</conf-loc>
          <fpage>160</fpage>
          <lpage>167</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Glorot</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Bordes</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Domain adaptation for large-scale sentiment classification: A deep learning approach</article-title>
          <year>2011</year>
          <conf-name>The 28th international conference on machine learning</conf-name>
          <conf-date>2011</conf-date>
          <conf-loc>Bellevue, Washington</conf-loc>
          <fpage>513</fpage>
          <lpage>520</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Distributed representations of words and phrases and their compositionality</article-title>
          <source>Advances in neural information processing systems</source>
          <year>2013</year>
          <conf-name>Neural Information Processing Systems (NIPS)</conf-name>
          <conf-date>2013</conf-date>
          <conf-loc>Lake Tahoe, Nevada</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gouws</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Bilbowa: Fast bilingual distributed representations without word alignments</article-title>
          <year>2015</year>
          <conf-name>The 32nd International Conference on Machine Learning</conf-name>
          <conf-date>2015</conf-date>
          <conf-loc>Lille, France</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hill</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Korhonen</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Learning Distributed Representations of Sentences from Unlabelled Data</article-title>
          <source>Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2016</year>
          <conf-name>NAACL 2016</conf-name>
          <conf-date>2016</conf-date>
          <conf-loc>San Diego, California</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peters</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Neumann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Iyyer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gardner</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Deep Contextualized Word Representations</article-title>
          <source>Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)</source>
          <year>2018</year>
          <conf-name>NAACL 2018</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>New Orleans, Louisiana</conf-loc>
          <fpage>2227</fpage>
          <lpage>2237</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McCann</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Bradbury</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xiong</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Learned in translation: Contextualized word vectors</article-title>
          <source>Advances in Neural Information Processing Systems</source>
          <year>2017</year>
          <conf-name>NIPS 2017</conf-name>
          <conf-date>2017</conf-date>
          <conf-loc>California</conf-loc>
          <fpage>6294</fpage>
          <lpage>6305</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peters</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ammar</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Bhagavatula</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Power</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Semi-supervised sequence tagging with bidirectional language models</article-title>
          <source>Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</source>
          <year>2017</year>
          <conf-name>ACL 2017</conf-name>
          <conf-date>2017</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bodenreider</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>The Unified Medical Language System (UMLS): integrating biomedical terminology</article-title>
          <source>Nucleic Acids Res</source>
          <year>2004</year>
          <month>01</month>
          <day>01</day>
          <volume>32</volume>
          <issue>Database issue</issue>
          <fpage>D267</fpage>
          <lpage>70</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/14681409"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/nar/gkh061</pub-id>
          <pub-id pub-id-type="medline">14681409</pub-id>
          <pub-id pub-id-type="pii">32/suppl_1/D267</pub-id>
          <pub-id pub-id-type="pmcid">PMC308795</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schubert</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Sander</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ester</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kriegel</surname>
              <given-names>HP</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>DBSCAN Revisited, Revisited</article-title>
          <source>ACM Trans Database Syst</source>
          <year>2017</year>
          <month>08</month>
          <day>24</day>
          <volume>42</volume>
          <issue>3</issue>
          <fpage>1</fpage>
          <lpage>21</lpage>
          <pub-id pub-id-type="doi">10.1145/3068335</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Van Asch</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <source>Macro- and micro-averaged evaluation measures</source>
          <year>2013</year>
          <access-date>2019-11-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://pdfs.semanticscholar.org/1d10/6a2730801b6210a67f7622e4d192bb309303.pdf">https://pdfs.semanticscholar.org/1d10/6a2730801b6210a67f7622e4d192bb309303.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ford</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Carroll</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>HE</given-names>
            </name>
            <name name-style="western">
              <surname>Scott</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Cassell</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>Extracting information from the text of electronic medical records to improve case detection: a systematic review</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2016</year>
          <month>09</month>
          <day>05</day>
          <volume>23</volume>
          <issue>5</issue>
          <fpage>1007</fpage>
          <lpage>15</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26911811"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocv180</pub-id>
          <pub-id pub-id-type="medline">26911811</pub-id>
          <pub-id pub-id-type="pii">ocv180</pub-id>
          <pub-id pub-id-type="pmcid">PMC4997034</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Denny</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Trent Rosenbloom</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Giuse</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Blanquicett</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Soysal</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>A long journey to short abbreviations: developing an open-source framework for clinical abbreviation recognition and disambiguation (CARD)</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2017</year>
          <month>04</month>
          <day>01</day>
          <volume>24</volume>
          <issue>e1</issue>
          <fpage>e79</fpage>
          <lpage>e86</lpage>
          <pub-id pub-id-type="doi">10.1093/jamia/ocw109</pub-id>
          <pub-id pub-id-type="medline">27539197</pub-id>
          <pub-id pub-id-type="pii">ocw109</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>GK</given-names>
            </name>
            <name name-style="western">
              <surname>Ogren</surname>
              <given-names>PV</given-names>
            </name>
            <name name-style="western">
              <surname>Duffy</surname>
              <given-names>PH</given-names>
            </name>
            <name name-style="western">
              <surname>Buntrock</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Chute</surname>
              <given-names>CG</given-names>
            </name>
          </person-group>
          <article-title>Mayo Clinic NLP System for Patient Smoking Status Identification</article-title>
          <source>Journal of the American Medical Informatics Association</source>
          <year>2008</year>
          <month>01</month>
          <day>01</day>
          <volume>15</volume>
          <issue>1</issue>
          <fpage>25</fpage>
          <lpage>28</lpage>
          <pub-id pub-id-type="doi">10.1197/jamia.m2437</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Albright</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Lanfranchi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fredriksen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Styler</surname>
              <given-names>WF</given-names>
            </name>
            <name name-style="western">
              <surname>Warner</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hwang</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Dligach</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Nielsen</surname>
              <given-names>RD</given-names>
            </name>
            <name name-style="western">
              <surname>Martin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ward</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Palmer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>GK</given-names>
            </name>
          </person-group>
          <article-title>Towards comprehensive syntactic and semantic annotations of the clinical narrative</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2013</year>
          <month>09</month>
          <day>01</day>
          <volume>20</volume>
          <issue>5</issue>
          <fpage>922</fpage>
          <lpage>30</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/23355458"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2012-001317</pub-id>
          <pub-id pub-id-type="medline">23355458</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2012-001317</pub-id>
          <pub-id pub-id-type="pmcid">PMC3756257</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Morioka</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Tawara</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Ogawa</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ogawa</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Iwata</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kobayashi</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Language Model Domain Adaptation Via Recurrent Neural Networks with Domain-Shared and Domain-Specific Representations</article-title>
          <source>2018 IEEE International Conference on Acoustics, Speech and Signal Processing</source>
          <year>2018</year>
          <conf-name>2018 IEEE International Conference on Acoustics, Speech and Signal Processing</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>Calgary, Canada</conf-loc>
          <fpage>6084</fpage>
          <lpage>6088</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Samanta</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Das</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Unsupervised domain adaptation using eigenanalysis in kernel space for categorisation tasks</article-title>
          <source>IET Image Processing</source>
          <year>2015</year>
          <volume>9</volume>
          <issue>11</issue>
          <fpage>925</fpage>
          <lpage>930</lpage>
          <pub-id pub-id-type="doi">10.1049/iet-ipr.2014.0754</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xiao</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Domain Adaptation for Sequence Labeling Tasks with a Probabilistic Language Adaptation Model</article-title>
          <year>2013</year>
          <conf-name>International Conference on Machine Learning 2013</conf-name>
          <conf-date>2013</conf-date>
          <conf-loc>Atlanta, Georgia</conf-loc>
          <fpage>293</fpage>
          <lpage>301</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xia</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Instance-based Domain Adaptation via Multiclustering Logistic Approximation</article-title>
          <source>IEEE Intell Syst</source>
          <year>2018</year>
          <month>1</month>
          <volume>33</volume>
          <issue>1</issue>
          <fpage>78</fpage>
          <lpage>88</lpage>
          <pub-id pub-id-type="doi">10.1109/mis.2018.012001555</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhai</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Instance weighting for domain adaptation in NLP</article-title>
          <source>Proceedings of the 45th Annual Meeting of the Association of Computational Linguistics. Association for Computational Linguistics</source>
          <year>2007</year>
          <conf-name>ACL 2007</conf-name>
          <conf-date>2007</conf-date>
          <conf-loc>Prague, Czech Republic</conf-loc>
          <fpage>264</fpage>
          <lpage>271</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schwartz</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Tsur</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Rappoport</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Koppel</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Authorship Attribution of Micro-Messages</article-title>
          <source>Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2013</year>
          <conf-name>EMNLP 2013</conf-name>
          <conf-date>2013</conf-date>
          <conf-loc>Seattle, Washington</conf-loc>
          <fpage>1880</fpage>
          <lpage>1891</lpage>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
