<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v7i2e11499</article-id>
      <article-id pub-id-type="pmid">31021325</article-id>
      <article-id pub-id-type="doi">10.2196/11499</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Adapting State-of-the-Art Deep Language Models to Clinical Information Extraction Systems: Potentials, Challenges, and Solutions</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Chu</surname>
            <given-names>Yuanchia</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Torii</surname>
            <given-names>Manabu</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="author" id="contrib1" corresp="yes">
          <name name-style="western">
            <surname>Zhou</surname>
            <given-names>Liyuan</given-names>
          </name>
          <degrees>BSc, MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Research School of Computer Science</institution>
            <institution>College of Engineering and Computer Science</institution>
            <institution>The Australian National University</institution>
            <addr-line>108 North Road</addr-line>
            <addr-line>Canberra, 2600</addr-line>
            <country>Australia</country>
            <phone>61 (02) 6125 5111</phone>
            <email>annjouno@gmail.com</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">http://orcid.org/0000-0001-9046-6098</ext-link>
        </contrib>
        <contrib contrib-type="author" id="contrib2">
          <name name-style="western">
            <surname>Suominen</surname>
            <given-names>Hanna</given-names>
          </name>
          <degrees>MSc, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-4195-1641</ext-link>
        </contrib>
        <contrib contrib-type="author" id="contrib3">
          <name name-style="western">
            <surname>Gedeon</surname>
            <given-names>Tom</given-names>
          </name>
          <degrees>BSc (Hons), PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">http://orcid.org/0000-0001-8356-4909</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
      <label>1</label>
      <institution>Research School of Computer Science</institution>
      <institution>College of Engineering and Computer Science</institution>  
      <institution>The Australian National University</institution>  
      <addr-line>Canberra</addr-line>
      <country>Australia</country></aff>
      <aff id="aff2">
      <label>2</label>
      <institution>Machine Learning Group</institution>
      <institution>Data61</institution>  
      <institution>Commonwealth Scientific and Industrial Research Organisation</institution>  
      <addr-line>Canberra</addr-line>
      <country>Australia</country></aff>
      <aff id="aff3">
      <label>3</label>
      <institution>Faculty of Science and Technology</institution>
      <institution>University of Canberra</institution>  
      <addr-line>Canberra</addr-line>
      <country>Australia</country></aff>
      <aff id="aff4">
      <label>4</label>
      <institution>Department of Future Technologies</institution>
      <institution>Faculty of Science and Engineering</institution>  
      <institution>University of Turku</institution>  
      <addr-line>Turku</addr-line>
      <country>Finland</country></aff>
      <author-notes>
        <corresp>Corresponding Author: Liyuan Zhou 
        <email>annjouno@gmail.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection"><season>Apr-Jun</season><year>2019</year></pub-date>
      <pub-date pub-type="epub">
        <day>25</day>
        <month>04</month>
        <year>2019</year>
      </pub-date>
      <volume>7</volume>
      <issue>2</issue>
      <elocation-id>e11499</elocation-id>
      <!--history from ojs - api-xml-->
      <history>
        <date date-type="received">
          <day>6</day>
          <month>7</month>
          <year>2018</year>
        </date>
        <date date-type="rev-request">
          <day>7</day>
          <month>10</month>
          <year>2018</year>
        </date>
        <date date-type="rev-recd">
          <day>2</day>
          <month>2</month>
          <year>2019</year>
        </date>
        <date date-type="accepted">
          <day>17</day>
          <month>2</month>
          <year>2019</year>
        </date>
      </history>
      <copyright-statement>©Liyuan Zhou, Hanna Suominen, Tom Gedeon. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 25.04.2019.</copyright-statement>
      <copyright-year>2019</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://medinform.jmir.org/2019/2/e11499/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Deep learning (DL) has been widely used to solve problems with success in speech recognition, visual object recognition, and object detection for drug discovery and genomics. Natural language processing has achieved noticeable progress in artificial intelligence. This gives an opportunity to improve on the accuracy and human-computer interaction of clinical informatics. However, due to difference of vocabularies and context between a clinical environment and generic English, transplanting language models directly from up-to-date methods to real-world health care settings is not always satisfactory. Moreover, the legal restriction on using privacy-sensitive patient records hinders the progress in applying machine learning (ML) to clinical language processing.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>The aim of this study was to investigate 2 ways to adapt state-of-the-art language models to extracting patient information from free-form clinical narratives to populate a handover form at a nursing shift change automatically for proofing and revising by hand: first, by using domain-specific word representations and second, by using transfer learning models to adapt knowledge from general to clinical English. We have described the practical problem, composed it as an ML task known as information extraction, proposed methods for solving the task, and evaluated their performance.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>First, word representations trained from different domains served as the input of a DL system for information extraction. Second, the transfer learning model was applied as a way to adapt the knowledge learned from general text sources to the task domain. The goal was to gain improvements in the extraction performance, especially for the classes that were topically related but did not have a sufficient amount of model solutions available for ML directly from the target domain. A total of 3 independent datasets were generated for this task, and they were used as the training (101 patient reports), validation (100 patient reports), and test (100 patient reports) sets in our experiments.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Our system is now the state-of-the-art in this task. Domain-specific word representations improved the macroaveraged F1 by 3.4%. Transferring the knowledge from general English corpora to the task-specific domain contributed a further 7.1% improvement. The best performance in populating the handover form with 37 headings was the macroaveraged F1 of 41.6% and F1 of 81.1% for filtering out irrelevant information. Performance differences between this system and its baseline were statistically significant (<italic>P</italic>&lt;.001; Wilcoxon test).</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>To our knowledge, our study is the first attempt to transfer models from general deep models to specific tasks in health care and gain a significant improvement. As transfer learning shows its advantage over other methods, especially on classes with a limited amount of training data, less experts’ time is needed to annotate data for ML, which may enable good results even in resource-poor domains.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>computer systems</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>deep learning</kwd>
        <kwd>information storage and retrieval</kwd>
        <kwd>medical informatics</kwd>
        <kwd>nursing records</kwd>
        <kwd>patient handoff</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Machine learning (ML) is being studied and used in a variety of health informatics applications (eg, disease progression prediction, therapy planning, medical diagnostic reasoning, and automatic patient management) as a way to help clinical experts to improve the efficiency and quality of medical care [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>].</p>
        <p>A clear majority of these applications use supervised learning, which infers knowledge from labeled training data. However, because of stringent restrictions on the use of clinical data [<xref ref-type="bibr" rid="ref3">3</xref>], data collections on real health care scenarios that are open for research and development are very limited [<xref ref-type="bibr" rid="ref4">4</xref>]. Moreover, the few available sources have limitations such as research-only use [<xref ref-type="bibr" rid="ref5">5</xref>], nondisclosure of data [<xref ref-type="bibr" rid="ref6">6</xref>], or limited commercial licenses [<xref ref-type="bibr" rid="ref7">7</xref>].</p>
        <p>Zheng et al [<xref ref-type="bibr" rid="ref8">8</xref>] proposed an information extraction (IE) framework called IDEAL-X, which uses online learning techniques to update the learning model based on user feedback. Although the performance of their system looks very impressive, the types of text this system is able to extract are limited to 5, and these types such as age, gender, and medicine can be easily retrieved with rule-based systems rather than ML systems. Leroy introduced a rule-based automated IE system that extracts diagnostic criteria from electronic health records for autism spectrum disorders [<xref ref-type="bibr" rid="ref9">9</xref>]. As the rules are manually generated based on human observations of 1 specific data set, their system cannot be generalized to other tasks.</p>
        <p>In our previous study [<xref ref-type="bibr" rid="ref4">4</xref>], we have already (1) discussed the importance of comprehensive record keeping along with information flow in health care in general and clinical handover in particular, (2) developed and freely released a set of 101 synthetic clinical handover cases with verbatim conversations and associated audio recordings constructed by a nurse with over 12 years of experience in clinical practice to make sure the cases are closely matched with the typical data found in a nursing shift change, and (3) introduced and evaluated a cascaded system that uses speech recognition (SR) to recognize verbal clinical handover information and IE to fill in a handover form for clinical proofing and sign-off (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p>
      </sec>
      <sec>
        <title>Objectives</title>
        <p>In this study, we have released another 2 datasets that follow exactly the same format as our first release to supplement the original dataset called the National Information and Communications Technology Australia (NICTA) Synthetic Nursing Handover Data. These 3 independent datasets target researchers who are training, validating, and testing ML-based SR and IE methods for the handover record-keeping task. A description of our dataset is available in <xref ref-type="app" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>More importantly, in this study, we have improved our IE performance by using an ML method, which learns from other data collections and transfers this knowledge to the handover task. Processing correctness is crucial in medical informatics applications; our benchmark results show that this task is very challenging [<xref ref-type="bibr" rid="ref4">4</xref>], and the previous state-of-the-art result on this task was only 38.2% on macro F1 [<xref ref-type="bibr" rid="ref10">10</xref>]. Even with our supplementary data, the size of the in-domain training set is still not adequate to train a traditional multilayer neural network (NN) model for our IE task composed as a 50-class classification.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>A processing pipeline that transforms verbal clinical handover information into electronic structured records automatically.</p>
          </caption>
          <graphic xlink:href="medinform_v7i2e11499_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Generating or getting access to a large manually labeled training corpus for this task is not easy. Fortunately, distributed word representations, which can be learned from unlabeled data, have recently been shown to have high utility in many natural language processing (NLP) applications [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref14">14</xref>]. In this study, we have investigated whether pretrained word embeddings generated from general Web text could improve our system performance in the IE task, even though it is relatively domain-specific. Furthermore, we are also interested to discover if training supplementary word embeddings based on a domain-related corpus is more helpful than from a general English corpus.</p>
        <p>Transferring the knowledge learned from another domain to our task is another way to cope with the problem of lack of training instances. This method has shown its effectiveness in previous studies [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. In this study, we have implemented a transfer learning–based approach to adapt weights of features and labels from different source data corpora to gain an improvement in the clinical handover IE task. More specifically, if we define our current task as the target domain, then the dataset that we want to adapt weights from is the source domain, so we first train sequence labeling models on a source domain training corpus and then learn the correlations between source labels and the labels in our task. After this, we use the model parameters of each related class in the source model to initialize our conditional random field (CRF) model in our clinical handover IE task. To extend the study, we have also explored whether models learned from a source corpus, which is close to a clinical domain, are more helpful than models trained on a generic, large labeled corpus.</p>
        <p>To summarize the contributions of this study, we have released the data to study SR and IE and introduced a state-of-the-art IE method for the handover task. The method is based on transfer learning and compares with both the most recent deep learning (DL) approaches and more traditional CRFs for sequence labeling.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>NICTA Synthetic Nursing Handover Data</title>
        <p>To support the construction of systems that automatically structure narrative documents from nursing shift-change speech and handover, the NICTA Synthetic Nursing Handover Data [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>] was created at NICTA/Data61 from 2012 to 2016. Its main author was a professional Registered Nurse (RN), Maricel Angel, who has over 12 years of experience in clinical nursing, based on general practice in medical wards. Therefore, the text is very similar to real documents in typical clinical scenarios.</p>
        <p>This data collection of 301 records in total contained 3 disjoint subsets for training (101 records), validation (100 records), and testing (100 records; <xref ref-type="fig" rid="figure2">Figure 2</xref>). All 3 subsets were created under a consistent practice with the same standards as used by Suominen et al [<xref ref-type="bibr" rid="ref4">4</xref>]. Each record contains a patient profile; a written, free-form clinical handover for this profile; a voice speech record of the handover; and, finally, a written, structured document. To represent the most common chronic diseases and national health priority areas in Australia [<xref ref-type="bibr" rid="ref20">20</xref>], 4 kinds of patients (ie, cardiovascular, neurological, renal, and respiratory patients) were introduced into each subset and independently followed a uniform distribution to provide a balanced demographic sample. The structured document includes annotation of 5 classes (PATIENT INTRODUCTION, MY SHIFT, APPOINTMENTS, MEDICATION, and FUTURE CARE), which were further divided into 37 subclasses, supplemented by the category of not applicable (NA) for irrelevant information.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Descriptive statistics of text snippets highlighted in the training, validation, and test set.</p>
          </caption>
          <graphic xlink:href="medinform_v7i2e11499_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Word Representation</title>
        <p>Word embedding is a vector matrix learned from an unlabeled text corpus that maps vocabulary to a dense vector space. It attempts to model the distributional hypothesis that words that occur in similar contexts tend to be semantically similar. It has been shown to contribute to a variety of NLP tasks even without using any other features [<xref ref-type="bibr" rid="ref21">21</xref>].</p>
        <p>To capture word vector representations from large amounts of unlabeled text, we adapted a skip-gram model [<xref ref-type="bibr" rid="ref22">22</xref>] that uses each current word to predict words in the neighboring context. The training objective of the skip-gram model is to maximize the averaged log probability over all training cases (<italic>T</italic>) of appearance of the context word <italic>w_{t+j}</italic> given the current word <italic>w_t</italic>, where <italic>j</italic> is the offset of the context word from the current word in a context window size of <italic>c</italic>:</p>
        <disp-formula>1/T ∑^{T}_{t=1} (∑_{-c≤j≤c, j≠0} (log p(w_{t+j}|w_t)))</disp-formula>
        <p>Then, softmax is applied to each context word <italic>w_O</italic> of a given occurrence of word <italic>w_I</italic>:</p>
        <disp-formula>P(w_O|w_I)=exp(v’^{T}_{w_O}v_{w_I})/∑^W_{w=1} exp(v’^T_w v_{w_I})</disp-formula>
        <p>...where v_w is the input and v’_w is the output word embedding of a word <italic>w</italic>, and <italic>W</italic> is the size of the training vocabulary.</p>
        <p>Out of the 2 variations to optimize computational efficiency of the skip-gram model, we have used negative sampling rather than hierarchical softmax because, for sequence tagging tasks in NLP, it can maintain more semantic information during the training process [<xref ref-type="bibr" rid="ref23">23</xref>] and obtain better results [<xref ref-type="bibr" rid="ref24">24</xref>]. Rather than calculating <italic>exp(v’^T_w v_{w_I})</italic> for all <italic>w</italic> in the vocabulary when calculating <italic>log p(w_{t+j}|w_t)</italic>, negative sampling replaces it with a logistic regression and distinguishes a context word <italic>w_O</italic> from noise (negative samples):</p>
        <disp-formula>log σ(v’^T_{w_O} v_{w_I}) + ∑^k_{i=1} E_{w_i~U(w)} [log σ(-v’^T_{w_i} v_{w_I})],</disp-formula>
        <p>...where <italic>k</italic> is the number of negative samples for each data sample, and <italic>U(w)</italic> is a unigram distribution of words.</p>
        <p>To integrate context information from a new domain-specific corpus <italic>D_1</italic> (eg, our clinical handover dataset), we need to update our existing word embeddings that were learned from a general large text collection <italic>D_0</italic>. However, this comes with a significant challenge in that the original word embeddings were trained on a very large corpus whereas our target domain training data are normally much smaller in size. More specifically, if we compare the vocabulary between <italic>D_0</italic> and <italic>D_1</italic>, <italic>V_0 ∩ V_1</italic> is the intersection between 2 vocabularies, which is mainly composed of more general terms compared with <italic>V_1/V_0</italic>: the relative complement of <italic>V_0</italic> with regard to <italic>V_1</italic>. As word vectors in <italic>V_0</italic> have already been trained for several epochs and converged to our desired values, we do not want to change them significantly. However, new words that were just introduced to the vocabulary from <italic>V_1/V_0</italic> contain domain-specific, related terms about which we care the most. Owing to the limitations in the available amount of training samples, a large learning rate at the beginning is useful to adjust these vectors to the regions they belong. Using the original skip-gram algorithm will potentially either adjust the already converged vectors from <italic>V_0</italic> away from their optimized values or the new word vectors will not get close to words that are similar to them in the vector space. To cope with this problem, we used 2 strategies in our experiments: averaged initialization as well as different learning rates.</p>
        <p>In averaged initialization, our assumption was that words in similar contexts would have similar meanings or have grammatical similarities. Although this is not always true, this strategy helps in learning the new words, as they start from an (averaged) optimized point rather than from scratch. To model this, whenever a new word appears, the initial value of the vector is set to be the averaged value in each dimension of words in the same sentence:</p>
        <disp-formula>1/S ∑^{S}_{i=1} <bold>v</bold>_i, where <bold>v</bold>_i = <bold>v</bold>, if <bold>v</bold> ∊ V_0,</disp-formula>
        <disp-formula><bold>v</bold>_i = <bold>v</bold>_0, otherwise</disp-formula>
        <p>...where <italic>S</italic> is the sentence length and <bold>v</bold><italic>_0</italic> denotes the vector in <italic>V_0</italic>, which represents the vector of <italic>unknown</italic> words.</p>
        <p>During the training procedure, we have also used different learning rates for words in <italic>V_1/V_0</italic> in contrast with <italic>V_0∩V_1</italic>:</p>
        <disp-formula>α_t=α_{0_new}(1–t/n), t≠0, and 
        <bold>v</bold>∊V_1/V_0</disp-formula>
        <disp-formula>α_t=0.2, t=0, and 
        <bold>v</bold>∊V_1/V_0</disp-formula>
        <disp-formula>α_t=α_{0_old}(1–t/n), t≠0, and 
        <bold>v</bold>∊V_1∩V_0</disp-formula>
        <disp-formula>α_t=α_{0_new}(|V_1/V_0|/|V_0∪V_1|), t=0, and 
        <bold>v</bold>∊V_1∩V_0</disp-formula>
        <p>...where <italic>n</italic> is the number of samples input to the network. For new words, the initial learning rate <italic>α_{0_new}</italic> is set to 0.2 and decreases over time. For words that are already in <italic>V_0</italic>, the initial learning rate is set to the proportion of new words in the entire vocabulary: the more learning samples we have for new words, the larger the initial learning rate for old words.</p>
      </sec>
      <sec>
        <title>Transfer Learning for Sequence Labeling</title>
        <p>For sequence tagging tasks that use supervised ML, the amount and purity of training data is crucial to the performance of our system. As the complexity of learning a 40-class classifier is high, for some labels, there is only 1 case in the training set, which could not be generalized to infer good functions [<xref ref-type="bibr" rid="ref25">25</xref>]. Therefore, introducing more training data could improve the final results. However, when more training instances are not available, or are extremely costly in human labor, transfer learning, which adapts weight matrixes from functions trained with another dataset and applied to the current task, is another way to gain knowledge of labels with limited training instances.</p>
        <p>The underlying idea of transfer learning is simple: in deep NNs, there are several hidden layers between the input and output layers; as data feed forward from the input layer to the output layer, the composition of features is learned from earlier layers [<xref ref-type="bibr" rid="ref26">26</xref>]. A typical sequence tagging structure can be demonstrated as the left block in <xref ref-type="fig" rid="figure3">Figure 3</xref>: neurons in lower layers tend to capture common concepts that are not domain- or task-specific, and later layers concatenate these features to generate higher-level concepts. Therefore, weights learned from other datasets or even other tasks can be potentially reused as long as the structure of the later layers is consistent between the source model and the target model.</p>
        <p>Several strategies of transfer learning on different NLP tasks and domains have been explored. A simple strategy is to copy all weight matrixes in the source model to the target model and fine-tune the target model with new data [<xref ref-type="bibr" rid="ref27">27</xref>], which successfully outperformed the leading team in the Multilingual Emoji Prediction task [<xref ref-type="bibr" rid="ref28">28</xref>] by 1.55% without any feature engineering procedure. However, this method requires the source and target model to have the exact same structure. An alternative strategy is to map annotations from different datasets into 3 consistent labels and use source domain model parameters directly as initializations for a target domain model in named entity recognition [<xref ref-type="bibr" rid="ref16">16</xref>]. Finally, human adjustment of rules and features [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>] or clustering labels from source domain and target domains to automatically generate label mappings from one dataset to the other [<xref ref-type="bibr" rid="ref31">31</xref>] can be applied as transfer learning strategies. However, their productivity may be limited when adapting a general source model to multiple tasks and also the generated mappings might not be satisfactory, especially when 2 datasets have dramatic differences in terms of phraseology and grammar.</p>
        <p>The method we have introduced in our study was able to transfer knowledge to a target domain that does not match labels in the source domain, does not depend on human integration during the label mapping process, and is able to map labels from very different datasets. This method follows 3 steps (<xref ref-type="fig" rid="figure3">Figure 3</xref>): the first step trains a CRF model on the source domain, the second step uses the weight matrix of the source model <italic>W^s</italic> to train a 2-layer CRF that predicts a target domain label given a source domain label, and, finally, the third step is to initialize the parameters of the target domain model using the product of <italic>W^s</italic> and the second layer weight matrix <italic>W^t</italic> obtained in step 2.</p>
        <p>First, in the source model training step, a linear-chain CRF model is trained on a large labeled source dataset. For each word <italic>x_i</italic> in a sequence <bold>x</bold>, <italic>y_i</italic> ∊ Y is the label of <italic>x_i</italic>, where Y is the label set. Then (<bold>x</bold>; <bold>y</bold>) is a sequence of word label pairs; a linear chain CRF is a distribution <italic>p</italic> (<bold>y</bold>| <bold>x</bold>) that takes the form:</p>
        <disp-formula>p(
        <bold>y</bold>|
        <bold>x</bold>) = 1/Z ∏^{L}_{l=1} exp(
        <bold>W</bold>^f f(y_l,
        <bold>x</bold>) + 
        <bold>W</bold>^g g(y_{l-1}, y_l))</disp-formula>
        <p>...where <italic>Z</italic> is a normalization constant, <italic>L</italic> is the length of <bold>x</bold>, <italic>f(y_l</italic>, <bold>x</bold><italic>)</italic> is a real-valued feature function of <bold>x</bold>, <italic>g(y_{l-1}, y_l)</italic> is a feature function of current label <italic>y_l</italic> and previous label <italic>y_{l-1}</italic> in the sequence to capture the cooccurrence between adjacent labels, and <bold>W</bold>^<italic>f</italic> and <bold>W</bold>^<italic>g</italic> contain the parameters of the feature functions.</p>
        <p>For now, the source CRF model can be seen as an NN with 2 hidden layers: the lower layer, which is connected by <bold>W</bold>^<italic>f</italic>, and the upper layer connected by <bold>W</bold>^<italic>g</italic> (<xref ref-type="fig" rid="figure3">Figure 3</xref>). However, because the information that is captured by <bold>W</bold>^<italic>g</italic> is normally domain- and task-specific, label correlations do not always have similarities between different domains, which would thus not be suitable to be transferred to the other task. In contrast, the lower layer, which learns the correlations between words and source labels that we are interested in, is actually a logistic regression model:</p>
        <disp-formula>σ(y*, <bold>x</bold>_i; <bold>W</bold>^f) = exp(<bold>W</bold>^f_{.y*} f(y*, <bold>x</bold>_i)) / ∑_{y∊Y} exp(<bold>W</bold>^f_{.y} f(y, <bold>x</bold>_i))</disp-formula>
        <p>Second, in the source label and target label correlations step, we have propagated the label probabilities from the later layer of the source domain model to another logistic regression classifier to form a 2-layer linear-chain CRF model that predicts target domain labels and uses source domain labels to learn correlations between the source and target labels. More specifically, the linear layer from the source model can be defined as:</p>
        <disp-formula>
        <bold>a</bold>_i=
        <bold>W</bold>^s f(y,
        <bold>x</bold>_i)</disp-formula>
        <p>...where each <bold>a</bold>_<italic>i</italic> is the probability for each source label and <bold>W</bold>^<italic>s</italic> denotes the weight matrix from the source domain. After this layer, a logistic regression classifier takes the output <bold>a</bold>_<italic>i</italic> to predict target labels:</p>
        <disp-formula>p(y’| 
        <bold>a</bold>)=σ(y’,
        <bold>a</bold>_i; 
        <bold>W</bold>^t),</disp-formula>
        <p>...where <italic>y’</italic> is the target type. This is equal to:</p>
        <disp-formula>p(y’| 
        <bold>x</bold>)=σ(y’,
        <bold>x</bold>_i; 
        <bold>W</bold>^t
        <bold>W</bold>^s)</disp-formula>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Transfer learning model structure.</p>
          </caption>
          <graphic xlink:href="medinform_v7i2e11499_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>After <bold>W</bold>^<italic>s</italic> and <bold>W</bold>^<italic>t</italic> are trained, we were able to initialize a CRF model to predict target labels with <bold>W</bold>^<italic>f</italic> = <bold>W</bold>^<italic>t</italic> <bold>W</bold>^<italic>s</italic> as the third step:</p>
        <disp-formula>p(<bold>y</bold>|<bold>x</bold>) = 1/Z ∏^{L}_{l=1} exp(<bold>W</bold>^t <bold>W</bold>^s f(y_l, <bold>x</bold>))</disp-formula>
        <p>During this procedure, the parameters of label NA, <bold>W</bold>^<italic>t_{NA}</italic>, are reset to zeros because the number of instances of NA in the text corpus is much larger than that of other labels, which would cause the model to be biased toward the dominant class.</p>
        <p>Theoretically, the parameters of feature functions will converge to the same weights as those of a randomly initialized model when the number of iterations is large enough because the loss function is convex. However, because our aim was to inherit knowledge from the source domain, updating <bold>W</bold>^<italic>t</italic> <bold>W</bold>^<italic>s</italic> too often would cause the model to forget what it has learned so far. Therefore, early stopping and the adaptive gradient algorithm (AdaGrad) [<xref ref-type="bibr" rid="ref32">32</xref>] are applied to preserve the learned source domain knowledge.</p>
      </sec>
      <sec>
        <title>Performance Evaluation and Experimental Design</title>
        <p>To compare the performance of systems, we have measured the precision, recall, and F1 (harmonic mean of precision and recall) over all categories [<xref ref-type="bibr" rid="ref33">33</xref>]. More specifically, microaveraged F1 and macroaveraged F1 are calculated. As our purpose was to emphasize systems that perform well in all classes rather than only in the classes that have the majority of instances, macroaveraged F1 was selected as the main evaluation measurement.</p>
        <p>The resources used in our experiments were derived from 7 different corpora (<xref ref-type="table" rid="table1">Table 1</xref>). Among them, 3 were general English text based, 3 were specific to the English health care domain, and 1 was the test set. Details of the test set are available in <xref ref-type="app" rid="app2">Multimedia Appendix 2</xref>.</p>
        <list list-type="order">
          <list-item>
            <p>General English Corpora:</p>
            <list list-type="bullet">
              <list-item>
                <p><italic>English Wikipedia</italic> is a freely available corpus from the September 2014 version of all pages from all Wikipedia wikis. It contains more than 3 million English pages, 100 million sentences, and 3.4 billion words in total after cleaning.</p>
              </list-item>
              <list-item>
                <p><italic>University of Maryland, Baltimore County (UMBC) WebBase corpus</italic> is a dataset containing a collection of 100 million English Web pages from more than 50,000 websites with over 3 billion words processed from the February 2007 crawl by the Stanford WebBase project [<xref ref-type="bibr" rid="ref34">34</xref>].</p>
              </list-item>
              <list-item>
                <p><italic>One Billion Words Benchmark for Language Modeling</italic> is a freely available standard corpus of 4.2 GB (0.8 billion words) for building and testing language models [<xref ref-type="bibr" rid="ref35">35</xref>].</p>
              </list-item>
            </list>
          </list-item>
          <list-item>
            <p>Medical Domain–specific English Corpora:</p>
            <list list-type="bullet">
              <list-item>
                <p><italic>I2B2</italic> is a collection of fully deidentified clinical records provided by the I2B2 National Centre for Biomedical Computing funded by U54LM008748 and was originally prepared for <italic>Shared Tasks for Challenges in NLP for Clinical Data</italic> organized by Uzuner, I2B2, and SUNY [<xref ref-type="bibr" rid="ref36">36</xref>-<xref ref-type="bibr" rid="ref39">39</xref>].</p>
              </list-item>
              <list-item>
                <p><italic>PubMed</italic> is a free resource containing over 27 million citations to the biomedical literature and publication abstracts derived from MEDLINE, life science journals, and Web books. It was developed and is maintained by the National Centre for Biotechnology Information at the US National Library of Medicine (NLM).</p>
              </list-item>
              <list-item>
                <p><italic>PubMed Central (PMC) Open Access Subset</italic> contains over 1 million biomedical articles from PMC, which is a free archive of biomedical and life sciences journal publications at the US National Institutes of Health's NLM.</p>
              </list-item>
              <list-item>
                <p><italic>NICTA TRAIN</italic> is the NICTA Synthetic nursing handover dataset, an open clinical dataset of 3 sets of nursing handover records, very similar to real documents in Australian English. Each record consists of a patient profile, spoken free-form text document, written free-form text document, and written structured document [<xref ref-type="bibr" rid="ref40">40</xref>].</p>
              </list-item>
            </list>
          </list-item>
        </list>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Word embeddings training corpora.</p>
          </caption>
          <table width="1000" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="400"/>
            <col width="200"/>
            <col width="400"/>
            <thead>
              <tr valign="top">
                <td>Corpus</td>
                <td>Size</td>
                <td>Source</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>English Wikipedia</td>
                <td>3.4 billion words</td>
                <td>Wikimedia downloads [<xref ref-type="bibr" rid="ref41">41</xref>]</td>
              </tr>
              <tr valign="top">
                <td>UMBC<sup>a</sup></td>
                <td>&gt;3 billion words</td>
                <td>UMBC WebBase corpus [<xref ref-type="bibr" rid="ref42">42</xref>]</td>
              </tr>
              <tr valign="top">
                <td>One Billion</td>
                <td>0.8 billion words</td>
                <td>One Billion Word Benchmark for Measuring Progress in Statistical Language Modeling [<xref ref-type="bibr" rid="ref43">43</xref>]</td>
              </tr>
              <tr valign="top">
                <td>I2B2</td>
                <td>18,082 unique words</td>
                <td>I2B2 NLP<sup>b</sup> research data sets [<xref ref-type="bibr" rid="ref6">6</xref>]</td>
              </tr>
              <tr valign="top">
                <td>PubMed</td>
                <td>27 million records</td>
                <td>PubMed resources [<xref ref-type="bibr" rid="ref44">44</xref>]</td>
              </tr>
              <tr valign="top">
                <td>PubMed Central</td>
                <td>1 million articles</td>
                <td>PubMed resources [<xref ref-type="bibr" rid="ref45">45</xref>]</td>
              </tr>
              <tr valign="top">
                <td>National Information and Communications Technology Australia Train</td>
                <td>101 records</td>
                <td>Hospital handover forms [<xref ref-type="bibr" rid="ref17">17</xref>]</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>UMBC: University of Maryland, Baltimore County.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>NLP: natural language processing.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Mapping of entity types between the source and target corpora.</p>
          </caption>
          <table width="1000" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="400"/>
            <col width="200"/>
            <col width="400"/>
            <thead>
              <tr valign="bottom">
                <td>Out-domain source: Bolt, Beranek and Newman</td>
                <td>In-domain source: I2B2</td>
                <td>Target: NICTA<sup>a</sup> Clinical Handover</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>PERSON</td>
                <td>PATIENT</td>
                <td>PATIENT_INTRODUCTION/Given_names</td>
              </tr>
              <tr valign="top">
                <td>PERSON</td>
                <td>PATIENT</td>
                <td>PATIENT_INTRODUCTION/Last_name</td>
              </tr>
              <tr valign="top">
                <td>PERSON</td>
                <td>DOCTOR</td>
                <td>PATIENT_INTRODUCTION/Under_Dr:Given_names</td>
              </tr>
              <tr valign="top">
                <td>PERSON</td>
                <td>DOCTOR</td>
                <td>PATIENT_INTRODUCTION/Under_Dr:Last_name</td>
              </tr>
              <tr valign="top">
                <td>PERSON</td>
                <td>DOCTOR</td>
                <td>APPOINTMENTS/Clinician:Last_name</td>
              </tr>
              <tr valign="top">
                <td>PERSON</td>
                <td>DOCTOR</td>
                <td>APPOINTMENTS/Clinician:Given_names</td>
              </tr>
              <tr valign="top">
                <td>ORGANIZATION:HOSPITAL</td>
                <td>HOSPITAL</td>
                <td>APPOINTMENTS/Hospital</td>
              </tr>
              <tr valign="top">
                <td>DATE:AGE</td>
                <td>—<sup>b</sup></td>
                <td>PATIENT_INTRODUCTION/Age_in_years</td>
              </tr>
              <tr valign="top">
                <td>PER_DESC</td>
                <td>—</td>
                <td>PATIENT_INTRODUCTION/Gender</td>
              </tr>
              <tr valign="top">
                <td>PER_DESC</td>
                <td>—</td>
                <td>APPOINTMENTS/Clinician Title</td>
              </tr>
              <tr valign="top">
                <td>GPE:CITY</td>
                <td>LOCATION</td>
                <td>APPOINTMENTS/City</td>
              </tr>
              <tr valign="top">
                <td>DATE:DATE</td>
                <td>DATE</td>
                <td>APPOINTMENTS/Day</td>
              </tr>
              <tr valign="top">
                <td>TIME</td>
                <td>—</td>
                <td>APPOINTMENTS/Time</td>
              </tr>
              <tr valign="top">
                <td>CARDINAL</td>
                <td>ID</td>
                <td>PATIENT_INTRODUCTION/Current_room</td>
              </tr>
              <tr valign="top">
                <td>CARDINAL</td>
                <td>ID</td>
                <td>PATIENT_INTRODUCTION/Current_bed</td>
              </tr>
              <tr valign="top">
                <td>PRODUCT:OTHER</td>
                <td>—</td>
                <td>Medication/Medicine</td>
              </tr>
              <tr valign="top">
                <td>SUBSTANCE:DRUG</td>
                <td>—</td>
                <td>Medication/Medicine</td>
              </tr>
              <tr valign="top">
                <td>QUANTITY:3D (volume)</td>
                <td>—</td>
                <td>Medication/Dosage</td>
              </tr>
              <tr valign="top">
                <td>QUANTITY:OTHER</td>
                <td>—</td>
                <td>Medication/Dosage</td>
              </tr>
              <tr valign="top">
                <td>QUANTITY:TEMPERATURE</td>
                <td>—</td>
                <td>My_shift/Status</td>
              </tr>
              <tr valign="top">
                <td>QUANTITY:WEIGHT</td>
                <td>—</td>
                <td>My_shift/Status</td>
              </tr>
              <tr valign="top">
                <td>SUBSTANCE:FOOD</td>
                <td>—</td>
                <td>My_shift/Input_diet</td>
              </tr>
              <tr valign="top">
                <td>FACILITY</td>
                <td>—</td>
                <td>APPOINTMENTS/Ward</td>
              </tr>
              <tr valign="top">
                <td>DISEASE</td>
                <td>—</td>
                <td>PATIENT_INTRODUCTION/Admission_reason/diagnosis</td>
              </tr>
              <tr valign="top">
                <td>DISEASE</td>
                <td>—</td>
                <td>PATIENT_INTRODUCTION/Chronic_condition</td>
              </tr>
              <tr valign="top">
                <td>DISEASE</td>
                <td>—</td>
                <td>PATIENT_INTRODUCTION/Disease/problem_history</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>NICTA: National Information and Communications Technology Australia.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>Does not contain any matching label from the source domain.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>For the source domain corpus, we used 1 domain-related dataset, which contained labels that were relevant but not exactly the same as in our target domain data, and 1 general domain corpus, which contained many general labels, including some labels that were relevant to biometrics. With this setup, we were keen to find out whether the parameters learned from the same domain were more valuable than those from general English.</p>
        <p>The related domain source corpus was the aforementioned I2B2. It included fully deidentified discharge summaries and progress notes from real hospital scenarios. All records had been manually annotated for concept, assertion, and relation information. The corpus contained entities of 7 different labels: PATIENT, DOCTOR, HOSPITAL, DATE, PHONE, LOCATION, and ID. They were potentially relevant to labels in our NICTA dataset.</p>
        <p>The general domain source corpus was Bolt, Beranek and Newman (BBN), which has a 1 million-word Penn Treebank corpus of Wall Street Journal texts annotated by BBN with 28 main types of entities: 12 named entity types (Person, Facility, Organization, geographical entities (GPE), Location, Nationality, Product, Event, Work of Art, Law, Language, and Contact-Info), 9 nominal entity types (Person, Facility, Organization, GPE, Product, Plant, Animal, Substance, and Disease and Game), and 7 numeric types (Date, Time, Percent, Money, Quantity, Ordinal, and Cardinal). These types were further divided into 64 subtypes [<xref ref-type="bibr" rid="ref46">46</xref>] (see <xref ref-type="table" rid="table2">Table 2</xref> for the types related to labels in the target domain).</p>
        <p>To examine what kind of word embeddings were most valuable to our task, we classified all the available datasets into 3 different groups: <italic>Group 1 General (English Wikipedia+UMBC+One Billion)</italic> was composed of general English materials, which do not contain many domain-specific words or sentences. <italic>Group 2 Biomedical literature (PubMed+PMC)</italic> was composed of biomedical literature and abstracts. Words in this group could be similar to clinical words but would be used in different ways, considering that publication writing is different from authoring clinical documents. <italic>Group 3 Clinical documents (I2B2+NICTA Train)</italic> was composed of clinical handovers, discharge summaries, and progress notes that closely resemble our task data. All corpora were preprocessed with the Stanford CoreNLP sentence splitter and tokenizer [<xref ref-type="bibr" rid="ref47">47</xref>]. Digits were replaced with NUM[Length] (eg, 08-08-1988 is replaced by NUM2-NUM2-NUM4); this method helps to capture some digit patterns such as dates and phone numbers and also dramatically decreases the number of words in the vocabularies. To compute vector representations of words, word2vec [<xref ref-type="bibr" rid="ref48">48</xref>] was used and modified with an extra option to incrementally train word embeddings based on existing models when given new text materials. We inherited the best parameter settings for named entity recognition from a previous study [<xref ref-type="bibr" rid="ref24">24</xref>]: 200-dimensional word vectors, 5 words in the context window, 10 negative samples, a starting learning rate of 0.05, and 20 iterations.</p>
        <p>Besides using word vectors as features, we also used a collection of hand-crafted features that were identical to our previous NICTA IE system [<xref ref-type="bibr" rid="ref25">25</xref>] for performance tracking. For each feature of 1-word instance, a unigram with a window size of 3 <italic>(w_{i-1}, w_i, w_{i+1})</italic>, and bigrams with a window size of 2 <italic>(w_{i-1}w_i, w_iw_{i+1})</italic> were used. Features used in our experiments include the lemma, part of speech tag, and parse tree, top 5 candidates and top mapping retrieved from the Unified Medical Language System (UMLS) [<xref ref-type="bibr" rid="ref49">49</xref>], medication score—derived from the Anatomical Therapeutic Chemical List, location, and frequency.</p>
        <p>To track the performance improvement on this task, the following 10 baselines were included for comparison: 1) <italic>Benchmark</italic>, 2) <italic>TUC-MI-A</italic>, 3) <italic>TUC-MI-B,</italic> 4) <italic>ECNU_ICA-A,</italic> 5) <italic>ECNU_ICA-B,</italic> 6) <italic>LQRZ-A,</italic> 7) <italic>LQRZ-B,</italic> 8) <italic>Unigram NN,</italic> 9) <italic>Random,</italic> 10) <italic>Majority</italic>.</p>
        <sec>
          <title>Benchmark</title>
          <p>This was the initial NICTA benchmark system on this task using a single-layer linear-chain CRF [<xref ref-type="bibr" rid="ref50">50</xref>] with L2 regularization and with the handcrafted features mentioned before as input. A detailed description of this system can be found in the study by Suominen et al [<xref ref-type="bibr" rid="ref4">4</xref>].</p>
          <p>Participants of Conference and Labs of the Evaluation Forum (CLEF) eHealth Evaluation Lab 2016 Task 1: The CLEF eHealth 2016 Task 1 required the participants to implement systems that are able to identify relevant text snippets from free-text nursing handovers [<xref ref-type="bibr" rid="ref51">51</xref>]. Participants were expected to train their systems using the given training set and optimize their performances using the validation set; their final result was tested on a previously confidential test set. It should be noted that the benchmark NICTA IE system was provided to participants in the CLEF task as well as feature generators and intermediate processing results [<xref ref-type="bibr" rid="ref51">51</xref>]. Participants could start their experiments from any point based on our previous work with very little effort. In fact, all systems except a and b were started from the NICTA benchmark IE system.</p>
          <list list-type="bullet">
            <list-item>
              <p><italic>TUC-MI-A</italic> was based on our benchmark system; rather than using our default features, this method constructed a 41-feature set based on Stanford CoreNLP, latent Dirichlet allocation, regular expressions, and the ontologies of WordNet and UMLS features [<xref ref-type="bibr" rid="ref10">10</xref>].</p>
            </list-item>
            <list-item>
              <p><italic>TUC-MI-B</italic> optimized TUC-MI-A; 19 features were selected from the whole feature set with forward and backward greedy search.</p>
            </list-item>
            <list-item>
              <p><italic>ECNU_ICA-A</italic> was a rule-based IE system to recognize bed number, room number, age, and doctor’s name and was combined with CRF results using the same feature collection with the organizers’ benchmark system [<xref ref-type="bibr" rid="ref52">52</xref>].</p>
            </list-item>
            <list-item>
              <p><italic>ECNU_ICA-B</italic> has the same system architecture as ECNU_ICA-A, except for CRF training, and a subcollection of features was used for different label types [<xref ref-type="bibr" rid="ref52">52</xref>].</p>
            </list-item>
            <list-item>
              <p><italic>LQRZ-A</italic> was a feed-forward neural network with one hidden layer initialized with uniform distribution. Inputs to this NN model are pretrained word embeddings from GoogleNews. No handcrafted features were used in this model [<xref ref-type="bibr" rid="ref53">53</xref>].</p>
            </list-item>
            <list-item>
              <p><italic>LQRZ-B</italic> firstly used a random forest to predict a subset of the tags and the previous NN to further discriminate between the remaining labels [<xref ref-type="bibr" rid="ref53">53</xref>].</p>
            </list-item>
          </list>
        </sec>
        <sec>
          <title>Unigram NN</title>
          <p>The unigram NN was an implementation of a 2-layer, first-order linear-chain graph transformer [<xref ref-type="bibr" rid="ref21">21</xref>] with handcrafted features weighted by word vectors as the first layer and a linear-chain CRF on top of it. The model was trained using AdaGrad. This is a baseline to show separately, from the multilayer NN, what is the performance gain from using word embeddings and transfer learning.</p>
        </sec>
        <sec>
          <title>Other Baselines</title>
          <p>We evaluated the task difficulty of labeling each word with 1 out of 37 classes by comparing 2 baseline systems: First, we built a system that assigns classes randomly. Second, we implemented another system that always predicts the majority class (ie, the most common class in the training set): <italic>Random</italic> to randomly select 1 class label for each instance and <italic>Majority</italic> to assign the majority class of Future_Goal/TaskToBeCompleted/ExpectedOutcome for every instance.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>Researchers worldwide have contributed to achieve a significant improvement on the clinical handover task because of a shared computational task organized in 2016 [<xref ref-type="bibr" rid="ref51">51</xref>]. In this study, we have reported the results from our experiments on the test set (<xref ref-type="table" rid="table3">Table 3</xref>) and have also taken this opportunity to overview performance improvements in the task, to summarize methods that have been used to solve the problems so far, and to inspire researchers to work further on this task. Overall, the state-of-the-art benchmark has been increased from 38.2% to 41.6% F1 (<italic>P</italic>&lt;.001; Wilcoxon test [<xref ref-type="bibr" rid="ref54">54</xref>]). Our transfer learning method using BBN as source domain (Trans_BBN) outperforms all other methods.</p>
      <table-wrap position="float" id="table3">
        <label>Table 3</label>
        <caption>
          <p>Results of transfer learning compared with baseline systems.</p>
        </caption>
        <table width="1000" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="300"/>
          <col width="120"/>
          <col width="120"/>
          <col width="120"/>
          <col width="120"/>
          <col width="120"/>
          <col width="100"/>
          <thead>
            <tr valign="bottom">
              <td>Method</td>
              <td>MacPrec<sup>a</sup></td>
              <td>MacRec<sup>b</sup></td>
              <td>MacF1<sup>c</sup></td>
              <td>MicPrec<sup>d</sup></td>
              <td>MicRec<sup>e</sup></td>
              <td>MicF1<sup>f</sup></td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Trans_BBN</td>
              <td><italic>0.498</italic><sup>g</sup></td>
              <td><italic>0.419</italic></td>
              <td><italic>0.416</italic></td>
              <td>0.547</td>
              <td><italic>0.488</italic></td>
              <td>0.516</td>
            </tr>
            <tr valign="top">
              <td>Trans_I2B2</td>
              <td>0.481</td>
              <td>0.390</td>
              <td>0.392</td>
              <td>0.565</td>
              <td>0.471</td>
              <td>0.514</td>
            </tr>
            <tr valign="top">
              <td>TUC-MI-B</td>
              <td>0.493</td>
              <td>0.369</td>
              <td>0.382</td>
              <td>0.500</td>
              <td>0.505</td>
              <td>0.503</td>
            </tr>
            <tr valign="top">
              <td>ECNU_ICA-A</td>
              <td>0.493</td>
              <td>0.406</td>
              <td>0.374</td>
              <td>0.510</td>
              <td>0.522</td>
              <td>0.516</td>
            </tr>
            <tr valign="top">
              <td>General+I2B2+train</td>
              <td>0.477</td>
              <td>0.361</td>
              <td>0.354</td>
              <td><italic>0.612</italic></td>
              <td>0.483</td>
              <td><italic>0.540</italic></td>
            </tr>
            <tr valign="top">
              <td>I2B2+train</td>
              <td>0.443</td>
              <td>0.367</td>
              <td>0.354</td>
              <td>0.604</td>
              <td>0.484</td>
              <td>0.537</td>
            </tr>
            <tr valign="top">
              <td>General</td>
              <td>0.429</td>
              <td>0.356</td>
              <td>0.345</td>
              <td>0.606</td>
              <td>0.478</td>
              <td>0.535</td>
            </tr>
            <tr valign="top">
              <td>LQRZ-B</td>
              <td>0.425</td>
              <td>0.383</td>
              <td>0.345</td>
              <td>0.490</td>
              <td>0.517</td>
              <td>0.503</td>
            </tr>
            <tr valign="top">
              <td>General+PubMed+PMC</td>
              <td>0.409</td>
              <td>0.346</td>
              <td>0.334</td>
              <td>0.606</td>
              <td>0.474</td>
              <td>0.532</td>
            </tr>
            <tr valign="top">
              <td>Unigram</td>
              <td>0.393</td>
              <td>0.292</td>
              <td>0.311</td>
              <td>0.574</td>
              <td>0.448</td>
              <td>0.503</td>
            </tr>
            <tr valign="top">
              <td>TUC-MI-A</td>
              <td>0.423</td>
              <td>0.300</td>
              <td>0.311</td>
              <td>0.503</td>
              <td>0.443</td>
              <td>0.471</td>
            </tr>
            <tr valign="top">
              <td>LQRZ-A</td>
              <td>0.411</td>
              <td>0.307</td>
              <td>0.308</td>
              <td>0.563</td>
              <td>0.472</td>
              <td>0.514</td>
            </tr>
            <tr valign="top">
              <td>ECNU_ICA-B</td>
              <td>0.428</td>
              <td>0.292</td>
              <td>0.297</td>
              <td>0.581</td>
              <td>0.459</td>
              <td>0.513</td>
            </tr>
            <tr valign="top">
              <td>National Information and Communications Technology Australia</td>
              <td>0.435</td>
              <td>0.233</td>
              <td>0.246</td>
              <td>0.433</td>
              <td>0.368</td>
              <td>0.398</td>
            </tr>
            <tr valign="top">
              <td>Random</td>
              <td>0.018</td>
              <td>0.028</td>
              <td>0.019</td>
              <td>0.018</td>
              <td>0.030</td>
              <td>0.022</td>
            </tr>
            <tr valign="top">
              <td>Majority</td>
              <td>0.000</td>
              <td>0.029</td>
              <td>0.001</td>
              <td>0.016</td>
              <td>0.027</td>
              <td>0.020</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table3fn1">
            <p><sup>a</sup>Macro averaged precision.</p>
          </fn>
          <fn id="table3fn2">
            <p><sup>b</sup>Macro averaged recall.</p>
          </fn>
          <fn id="table3fn3">
            <p><sup>c</sup>Macro averaged F1.</p>
          </fn>
          <fn id="table3fn4">
            <p><sup>d</sup>Micro averaged precision.</p>
          </fn>
          <fn id="table3fn5">
            <p><sup>e</sup>Micro averaged recall.</p>
          </fn>
          <fn id="table3fn6">
            <p><sup>f</sup>Micro averaged F1.</p>
          </fn>
          <fn id="table3fn7">
            <p><sup>g</sup>Italics indicate the best result over the column.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <p>Transfer learning with I2B2 as a source model (Trans_I2B2) is also able to increase the overall macro F1 by 4.7% (<italic>P</italic>&lt;.001) compared with models using a 2-layer NN with general word embeddings (General). When using the same collection of handcrafted features, a 2-layer NN model (Unigram) performs 6.5% better than a single-layer linear-chain CRF (NICTA). The same model (Unigram) gains 3.4% improvement of macro F1 (<italic>P</italic>&lt;.001) by using word embeddings pretrained with a large text collection with general English (General). Word embeddings trained with a domain-related corpus but different context (General+PubMed+PMC) actually harm rather than help the result. This is possibly because although the domain-related corpus contains medical terms, which are also used in a clinical health care environment, the context of these terms is still very different from clinical handovers. On the contrary, documents used in a similar scenario (I2B2+train) show their advantage at this point. Finally, embeddings trained with a combination of I2B2+train with general English (General+I2B2+train) do not help the system to increase the macro F1, but they yield the best result on micro F1.</p>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>It can be seen from the experiment results that the DL system using pretrained word representations as the input, and the proposed transfer learning technique, is able to achieve better performance.</p>
        <p>When comparing the results of different system setups on different subclasses, we observed that word representations learned from different domains and the knowledge transferred from various sources affect the clinical IE system on certain subclasses.</p>
        <p>Compared with the best result of feature engineering methods used in TUC-MI [<xref ref-type="bibr" rid="ref9">9</xref>], our transfer learning method performs 3.4% better without a labor-intensive feature-selection procedure. Furthermore, in contrast with the rule-based methods used in ECNU_ICA [<xref ref-type="bibr" rid="ref52">52</xref>], which require domain-specific experts to inspect data carefully and make the rules, our method is much more efficient and still able to achieve a 4.2% better macro F1. Finally, the best LQRZ [<xref ref-type="bibr" rid="ref53">53</xref>] used a very similar architecture to our General model, and we can see their performance is very similar as well; the minor difference is caused by different materials to train the word embeddings. Our transfer learning method is able to improve macro F1 by 7.1% on top of the General model (<italic>P</italic>&lt;.001).</p>
        <p>In this section, we analyze these results and discuss these effects. The default measure is the official macro F1 unless specifically mentioned otherwise.</p>
      </sec>
      <sec>
        <title>Word Representations</title>
        <p>Word embeddings trained from general English can improve the clinical IE performance. Our results show that the <italic>General</italic> model, which used exactly the same model structure and feature map as the <italic>Unigram</italic> model, except it used a combination of 3 large corpora (English Wiki, UMBC, and One Billion) to train general word embeddings, performed better on the overall task (34.5% vs 31.1%, respectively; <italic>P</italic>&lt;.001). This indicates that word representations trained from unlabeled general English text are able to capture word features that contribute to classifying different annotations in clinical handovers.</p>
        <p>Moreover, general word embeddings fine-tuned with a small task-relevant dataset can further increase the result. The model trained with I2B2 and NICTA training data <italic>(I2B2+train)</italic> outperforms the <italic>Unigram</italic> model by 4.3% and outperforms general embeddings when it is compared with the <italic>General</italic> model (35.4% vs 34.5%, respectively; <italic>P</italic>=.17).</p>
        <p>However, no evidence was found to indicate that continuing training word embeddings with a relevant dataset based on pretrained general word embeddings contributes to the system performance when comparing the <italic>I2B2+train</italic> with <italic>General+I2B2+train</italic>. This might be because although the corpora of I2B2 and NICTA training data are significantly smaller than the general English corpus, the vocabulary is still enough to cover words that are present in the test set, and after several iterations of training, word embeddings in these 2 different settings eventually converged to similar values.</p>
        <p>Word embeddings trained from domain relevant data do not show any evidence to contribute to improving the system result either. Our results showed that the <italic>General+PubMed+PMC</italic> model performed worse than the <italic>General</italic> model (33.4% vs 34.5%, respectively; <italic>P</italic>=.07). This might be because, even though we considered clinical and biomedical areas as relevant, their differing scenarios mean that the vocabulary and context could end up too different. This would introduce more noise to the word embeddings and so does not contribute to the IE performance.</p>
      </sec>
      <sec>
        <title>Transfer Learning</title>
        <p>Transfer learning shows its advantage in the clinical handover IE task. The top 2 systems were both transfer learning models. Transfer learning from BBN <italic>(Trans_BBN)</italic> was 3.4% higher than the previous best system <italic>TUC-MI-B</italic> (41.6% vs 38.2%, respectively; <italic>P</italic>&lt;.001).</p>
        <p>For the overall result, there is no strong evidence to show any advantage of transfer from domain-relevant source data <italic>(Trans_I2B2)</italic> over general annotations <italic>(Trans_BBN)</italic>. On the contrary, transfer learning from BBN with general annotations performed slightly better on macro F1 than transfer learning from I2B2, which contains more entities relevant to our target task (41.6% vs 39.2%, respectively; <italic>P</italic>&lt;.001).</p>
        <p>For subclasses, <xref ref-type="table" rid="table4">Table 4</xref> shows the results of transfer learning compared with the baseline system when the performance is improved on subclasses. When referring to <xref ref-type="table" rid="table2">Table 2</xref>:</p>
        <list list-type="order">
          <list-item>
            <p>Some subclasses where the performance is improved by transfer learning <italic>HAVE</italic> a mapping annotation type from the source domain: for example, subclass PATIENT_INTRODUCTION: Age in years has a mapping annotation DATE:AGE in the source domain BBN, and Trans_BBN on this subclass performed better than the General model (96.5% vs 94.8%, respectively; <italic>P</italic>&lt;.001). This indicates that when the target domain labels have mappings from the source domain annotations, transfer learning can improve the extraction results of these labels.</p>
          </list-item>
          <list-item>
            <p>Some subclasses where the performance is improved by transfer learning <italic>do not have</italic> a mapping annotation type from the source domain: for subclass FUTURE_CARE: Alert/warning/abnormal result, the general model was not able to predict any instance correctly, whereas transfer learning did learn some knowledge from the training set but the performance was still not very high. This might be because these subclasses may have some underlying correlations with source domain labels that are automatically learned during the second process in our method, even though the correlations were not straightforward or obvious for human readers.</p>
          </list-item>
          <list-item>
            <p>Some subclasses that have mappings from the source domains do not gain any improvement from transfer learning: for example, PATIENT_INTRODUCTION/ Given_names. These classes normally already have good performance from only using general models, so transfer learning, in this case, might introduce extra noise from other domains that potentially have different sentence structures to the target domain, and thus harm the results.</p>
          </list-item>
        </list>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Results of subclasses when transfer learning improved on the baseline system (F1 score).</p>
          </caption>
          <table width="1000" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="520"/>
            <col width="120"/>
            <col width="120"/>
            <col width="120"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td>Entity type</td>
                <td>Instances (n)</td>
                <td>General</td>
                <td>Trans_I2B2</td>
                <td>Trans_BBN</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>PATIENT_INTRODUCTION: Age (years)</td>
                <td>246</td>
                <td>0.948</td>
                <td>0.879</td>
                <td><italic>0.965</italic><sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>PATIENT_INTRODUCTION: Gender</td>
                <td>88</td>
                <td>0.826</td>
                <td>0.896</td>
                <td><italic>0.917</italic></td>
              </tr>
              <tr valign="top">
                <td>PATIENT_INTRODUCTION: Admission reason</td>
                <td>412</td>
                <td>0.214</td>
                <td>0.311</td>
                <td><italic>0.344</italic></td>
              </tr>
              <tr valign="top">
                <td>PATIENT_INTRODUCTION: Chronic condition</td>
                <td>70</td>
                <td>0.000</td>
                <td><italic>0.105</italic></td>
                <td>0.081</td>
              </tr>
              <tr valign="top">
                <td>PATIENT_INTRODUCTION: Disease/problem history</td>
                <td>147</td>
                <td>0.016</td>
                <td><italic>0.083</italic></td>
                <td>0.044</td>
              </tr>
              <tr valign="top">
                <td>PATIENT_INTRODUCTION: Care plan</td>
                <td>36</td>
                <td>0.069</td>
                <td>0.129</td>
                <td><italic>0.133</italic></td>
              </tr>
              <tr valign="top">
                <td>PATIENT_INTRODUCTION: Allergy</td>
                <td>14</td>
                <td>0.267</td>
                <td>0.500</td>
                <td><italic>0.566</italic></td>
              </tr>
              <tr valign="top">
                <td>APPOINTMENTS: Time</td>
                <td>28</td>
                <td>0.114</td>
                <td><italic>0.431</italic></td>
                <td>0.400</td>
              </tr>
              <tr valign="top">
                <td>APPOINTMENTS: Place: Ward</td>
                <td>3</td>
                <td>0.000</td>
                <td>0.000</td>
                <td><italic>0.400</italic></td>
              </tr>
              <tr valign="top">
                <td>APPOINTMENTS: Status</td>
                <td>159</td>
                <td>0.111</td>
                <td><italic>0.175</italic></td>
                <td>0.132</td>
              </tr>
              <tr valign="top">
                <td>FUTURE_CARE: Alert/warning/abnormal result</td>
                <td>59</td>
                <td>0.000</td>
                <td>0.087</td>
                <td><italic>0.178</italic></td>
              </tr>
              <tr valign="top">
                <td>FUTURE_CARE: Goal/task to be completed/expected outcome</td>
                <td>496</td>
                <td>0.000</td>
                <td>0.068</td>
                <td><italic>0.070</italic></td>
              </tr>
              <tr valign="top">
                <td>FUTURE_CARE: Discharge/transfer place</td>
                <td>89</td>
                <td>0.327</td>
                <td>0.288</td>
                <td><italic>0.361</italic></td>
              </tr>
              <tr valign="top">
                <td>MY_SHIFT: Status</td>
                <td>481</td>
                <td>0.570</td>
                <td><italic>0.688</italic></td>
                <td>0.638</td>
              </tr>
              <tr valign="top">
                <td>MY_SHIFT: Input/diet</td>
                <td>101</td>
                <td>0.413</td>
                <td>0.783</td>
                <td><italic>0.804</italic></td>
              </tr>
              <tr valign="top">
                <td>MY_SHIFT: Output/diuresis/bowel movement</td>
                <td>52</td>
                <td>0.286</td>
                <td>0.396</td>
                <td><italic>0.478</italic></td>
              </tr>
              <tr valign="top">
                <td>MY_SHIFT: Wounds/skin</td>
                <td>55</td>
                <td>0.444</td>
                <td>0.357</td>
                <td><italic>0.457</italic></td>
              </tr>
              <tr valign="top">
                <td>MY_SHIFT: Activities of daily living</td>
                <td>245</td>
                <td>0.579</td>
                <td><italic>0.753</italic></td>
                <td>0.748</td>
              </tr>
              <tr valign="top">
                <td>MY_SHIFT: Other observation</td>
                <td>361</td>
                <td>0.177</td>
                <td><italic>0.220</italic></td>
                <td>0.202</td>
              </tr>
              <tr valign="top">
                <td>MEDICATION: Medicine</td>
                <td>156</td>
                <td>0.450</td>
                <td><italic>0.548</italic></td>
                <td>0.495</td>
              </tr>
              <tr valign="top">
                <td>MEDICATION: Status</td>
                <td>68</td>
                <td>0.034</td>
                <td><italic>0.086</italic></td>
                <td>0.085</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>Italics indicate the best result in each row.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This study investigated adapting a DL method to extract patient information from clinical reports. Domain- and task-specific word representations have been used as inputs to a DL system to achieve better performance. In addition, a transfer learning model has been applied to adapt knowledge learned from general text sources to a domain-specific task. This method was able to further improve the overall result, especially in the classes related to the source domain. Domain-specific word representations improve the overall clinical IE system performance by 3.4% on macro-F1. Transferring the knowledge from a general English corpus to our task-specific domain gains a further 7.1% improvement. To our knowledge, our study is the first attempt to transfer knowledge from general deep models to specific tasks in health care and gain a significant improvement. The result of our system is state-of-the-art on this task. Our method and result point out the way toward adapting an advanced ML technique to professional informatics system tasks.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <app id="app1">
        <title>Multimedia Appendix 1</title>
        <p>Dataset description.</p>
        <media xlink:href="medinform_v7i2e11499_app1.pdf" xlink:title="PDF File (Adobe PDF File), 77KB"/>
      </app>
      <app id="app2">
        <title>Multimedia Appendix 2</title>
        <p>The test set of our experiments, in a format where each row contains a word with its features and the last column is the human-assigned label.</p>
        <media xlink:href="medinform_v7i2e11499_app2.zip" xlink:title="ZIP File (Zip Archive), 189KB"/>
      </app>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AdaGrad</term>
          <def>
            <p>adaptive gradient algorithm</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BBN</term>
          <def>
            <p>Bolt, Beranek and Newman</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CLEF</term>
          <def>
            <p>Conference and Labs of the Evaluation Forum</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CRF</term>
          <def>
            <p>conditional random field</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">DL</term>
          <def>
            <p>deep learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">GPE</term>
          <def>
            <p>geographical entities</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">IE</term>
          <def>
            <p>information extraction</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">ML</term>
          <def>
            <p>machine learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">NICTA</term>
          <def>
            <p>National Information and Communications Technology Australia</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">NN</term>
          <def>
            <p>neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">NLM</term>
          <def>
            <p>National Library of Medicine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">PMC</term>
          <def>
            <p>PubMed Central</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">RN</term>
          <def>
            <p>registered nurse</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">SR</term>
          <def>
            <p>speech recognition</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">UMBC</term>
          <def>
            <p>University of Maryland, Baltimore County</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">UMLS</term>
          <def>
            <p>Unified Medical Language System</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by the Commonwealth Department of Education and Training (The Australian National University Australian Postgraduate Award). The authors express their gratitude to Maricel Angel, RN at NICTA, for helping them to create the dataset for SR and IE.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Jordan</surname>
            <given-names>MI</given-names>
          </name>
          <name name-style="western">
            <surname>Mitchell</surname>
            <given-names>TM</given-names>
          </name>
        </person-group>
        <article-title>Machine learning: trends, perspectives, and prospects</article-title>
        <source>Science</source>  
        <year>2015</year>  
        <month>07</month>  
        <day>17</day>  
        <volume>349</volume>  
        <issue>6245</issue>  
        <fpage>255</fpage>  
        <lpage>60</lpage>  
        <pub-id pub-id-type="doi">10.1126/science.aaa8415</pub-id>
        <pub-id pub-id-type="medline">26185243</pub-id>
        <pub-id pub-id-type="pii">349/6245/255</pub-id></nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Miller</surname>
            <given-names>DD</given-names>
          </name>
          <name name-style="western">
            <surname>Brown</surname>
            <given-names>EW</given-names>
          </name>
        </person-group>
        <article-title>Artificial intelligence in medical practice: the question to the answer?</article-title>
        <source>Am J Med</source>  
        <year>2018</year>  
        <month>02</month>  
        <volume>131</volume>  
        <issue>2</issue>  
        <fpage>129</fpage>  
        <lpage>33</lpage>  
        <pub-id pub-id-type="doi">10.1016/j.amjmed.2017.10.035</pub-id>
        <pub-id pub-id-type="medline">29126825</pub-id>
        <pub-id pub-id-type="pii">S0002-9343(17)31117-8</pub-id></nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Anderson</surname>
            <given-names>W</given-names>
          </name>
        </person-group>
        <article-title>2007 National statement on ethical conduct in human research</article-title>
        <source>Intern Med J</source>  
        <year>2011</year>  
        <month>07</month>  
        <volume>41</volume>  
        <issue>7</issue>  
        <fpage>581</fpage>  
        <lpage>2</lpage>  
        <pub-id pub-id-type="doi">10.1111/j.1445-5994.2011.02528.x</pub-id>
        <pub-id pub-id-type="medline">21762341</pub-id></nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Suominen</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Zhou</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Hanlen</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Ferraro</surname>
            <given-names>G</given-names>
          </name>
        </person-group>
        <article-title>Benchmarking clinical speech recognition and information extraction: new data, methods, and evaluations</article-title>
        <source>JMIR Med Inform</source>  
        <year>2015</year>  
        <volume>3</volume>  
        <issue>2</issue>  
        <fpage>e19</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://medinform.jmir.org/2015/2/e19"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/medinform.4321</pub-id>
        <pub-id pub-id-type="medline">25917752</pub-id>
        <pub-id pub-id-type="pii">v3i2e19</pub-id>
        <pub-id pub-id-type="pmcid">PMC4427705</pub-id></nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Boyle</surname>
            <given-names>DI</given-names>
          </name>
          <name name-style="western">
            <surname>Rafael</surname>
            <given-names>N</given-names>
          </name>
        </person-group>
        <article-title>BioGrid Australia and GRHANITE™: privacy-protecting subject matching</article-title>
        <source>Stud Health Technol Inform</source>  
        <year>2011</year>  
        <volume>168</volume>  
        <fpage>24</fpage>  
        <lpage>34</lpage>  
        <pub-id pub-id-type="doi">10.3233/978-1-60750-791-8-24</pub-id>
        <pub-id pub-id-type="medline">21893908</pub-id></nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="web">
        <source>I2B2</source>  
        <access-date>2018-03-07</access-date>
        <comment>NLP research data sets 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.i2b2.org/NLP/DataSets/">https://www.i2b2.org/NLP/DataSets/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6xjXRGcPN"/></comment> </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Graff</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Cieri</surname>
            <given-names>C</given-names>
          </name>
        </person-group>
        <source>Linguistic Data Consortium</source>  
        <access-date>2019-01-28</access-date>
        <comment>English gigaword, linguistic data consortium 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://catalog.ldc.upenn.edu/LDC2003T05">https://catalog.ldc.upenn.edu/LDC2003T05</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="75lT1QCRw"/></comment> </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zheng</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Lu</surname>
            <given-names>J J</given-names>
          </name>
          <name name-style="western">
            <surname>Ghasemzadeh</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Hayek</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Quyyumi</surname>
            <given-names>A A</given-names>
          </name>
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>F</given-names>
          </name>
        </person-group>
        <article-title>Effective information extraction framework for heterogeneous clinical reports using online machine learning and controlled vocabularies</article-title>
        <source>JMIR Med Inform</source>  
        <year>2017</year>  
        <month>05</month>  
        <day>9</day>  
        <volume>5</volume>  
        <issue>2</issue>  
        <fpage>e12</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://medinform.jmir.org/2017/2/e12/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/medinform.7235</pub-id>
        <pub-id pub-id-type="medline">28487265</pub-id>
        <pub-id pub-id-type="pii">v5i2e12</pub-id>
        <pub-id pub-id-type="pmcid">PMC5442348</pub-id></nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Leroy</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Gu</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Pettygrove</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Galindo</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Arora</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Kurzius-Spencer</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Automated extraction of diagnostic criteria from electronic health records for autism spectrum disorders: development, evaluation, and application</article-title>
        <source>J Med Internet Res</source>  
        <year>2018</year>  
        <month>11</month>  
        <day>7</day>  
        <volume>20</volume>  
        <issue>11</issue>  
        <fpage>e10497</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jmir.org/2018/11/e10497/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/10497</pub-id>
        <pub-id pub-id-type="medline">30404767</pub-id>
        <pub-id pub-id-type="pii">v20i11e10497</pub-id>
        <pub-id pub-id-type="pmcid">PMC6249505</pub-id></nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ebersbach</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Herms</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Lohr</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Eibl</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Wrappers for feature subset selection in CRF-based clinical information extraction</article-title>
        <year>2016</year>  
        <conf-name>Conference and Labs of the Evaluation Forum (CLEF)</conf-name>
        <conf-date>September 5-8, 2016</conf-date>
        <conf-loc>Evora, Portugal</conf-loc>
        <fpage>69</fpage>  
        <lpage>80</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://ceur-ws.org/Vol-1609/16090069.pdf"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Collobert</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Weston</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>A unified architecture for natural language processing: Deep neural networks with multitask learning</article-title>
        <source>Proceedings of the 25th international conference on Machine learning</source>  
        <year>2008</year>  
        <conf-name>ICML'08</conf-name>
        <conf-date>July 5-9, 2008</conf-date>
        <conf-loc>Helsinki, Finland</conf-loc>
        <publisher-name>ACM</publisher-name>
        <fpage>160</fpage>  
        <lpage>167</lpage> </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Turian</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Ratinov</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Bengio</surname>
            <given-names>Y</given-names>
          </name>
        </person-group>
        <article-title>Word representations: A simple and general method for semi-supervised learning</article-title>
        <source>Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics</source>  
        <year>2010</year>  
        <conf-name>ACL'10</conf-name>
        <conf-date>July 11-16, 2010</conf-date>
        <conf-loc>Uppsala, Sweden</conf-loc>
        <publisher-name>the Association for Computational Linguistics (ACL)</publisher-name>
        <fpage>384</fpage>  
        <lpage>394</lpage> </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Baroni</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Dinu</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Kruszewski</surname>
            <given-names>G</given-names>
          </name>
        </person-group>
        <article-title>Don't count, predict! A systematic comparison of context-counting vs context-predicting semantic vectors</article-title>
        <year>2014</year>  
        <conf-name>ACL'14</conf-name>
        <conf-date>June 22-27, 2014</conf-date>
        <conf-loc>Baltimore, Maryland, USA</conf-loc>
        <publisher-name>Association for Computational Linguistics (ACL)</publisher-name>
        <fpage>1:238</fpage>  
        <lpage>47</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.aclweb.org/anthology/P14-1023"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Andreas</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Klein</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <article-title>How much do word embeddings encode about syntax?</article-title>
        <year>2014</year>  
        <conf-name>ACL'14</conf-name>
        <conf-date>June 22-27, 2014</conf-date>
        <conf-loc>Baltimore, Maryland, USA</conf-loc>
        <publisher-name>Association for Computational Linguistics (ACL)</publisher-name>
        <fpage>822</fpage>  
        <lpage>7</lpage> </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Qu</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Ferraro</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Zhou</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Hou</surname>
            <given-names>W</given-names>
          </name>
          <name name-style="western">
            <surname>Baldwin</surname>
            <given-names>T</given-names>
          </name>
        </person-group>
        <article-title>Named entity recognition for novel types by transfer learning</article-title>
        <year>2016</year>  
        <conf-name>2016 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name>
        <conf-date>November 1–5, 2016</conf-date>
        <conf-loc>Austin, Texas, USA</conf-loc>
        <publisher-name>Association for Computational Linguistics (ACL)</publisher-name>
        <fpage>899</fpage>  
        <lpage>905</lpage> </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Arnold</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Nallapati</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Cohen</surname>
            <given-names>WW</given-names>
          </name>
        </person-group>
        <article-title>Exploiting feature hierarchy for transfer learning in named entity recognition</article-title>
        <year>2008</year>  
        <conf-name>46th Annual Meeting of the Association for Computational Linguistics (ACL)</conf-name>
        <conf-date>June 15-20, 2008</conf-date>
        <conf-loc>Columbus, Ohio, USA</conf-loc>
        <publisher-name>Association for Computational Linguistics (ACL)</publisher-name>
        <fpage>245</fpage>  
        <lpage>53</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://aclweb.org/anthology/P08-1029"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Suominen</surname>
            <given-names>H</given-names>
          </name>
        </person-group>
        <source>Kaggle</source>  
        <year>2017</year>  
        <access-date>2018-05-06</access-date>
        <comment>NICTA synthetic nursing handover data 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.kaggle.com/c/hospital-handover-forms">https://www.kaggle.com/c/hospital-handover-forms</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6zDe1dLzT"/></comment> </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Angel</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Suominen</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Zhou</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Hanlen</surname>
            <given-names>L</given-names>
          </name>
        </person-group>
        <source>CSIRO Data Access Portal</source>  
        <year>2014</year>  
        <access-date>2019-01-28</access-date>
        <comment>Synthetic nursing handover training and development data set - text files 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://data.csiro.au/collections/">https://data.csiro.au/collections/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="75lrDsAGg"/></comment> </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Angel</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Suominen</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Zhou</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Hanlen</surname>
            <given-names>L</given-names>
          </name>
        </person-group>
        <source>CSIRO Data Access Portal</source>  
        <year>2014</year>  
        <access-date>2019-01-28</access-date>
        <comment>Synthetic nursing handover training and development data set - audio files 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://data.csiro.au/collections/">https://data.csiro.au/collections/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="75lrAfyeb"/></comment> </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
        <source>Australian Institute of Health and Welfare</source>  
        <year>2016</year>  
        <access-date>2019-01-28</access-date>
        <comment>Chronic diseases 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aihw.gov.au/reports-data/health-conditions-disability-deaths/chronic-disease/overview">https://www.aihw.gov.au/reports-data/health-conditions-disability-deaths/chronic-disease/overview</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="75lV7EA2r"/></comment> </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Collobert</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Weston</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Bottou</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Karlen</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Kavukcuoglu</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Kuksa</surname>
            <given-names>P</given-names>
          </name>
        </person-group>
        <article-title>Natural language processing (almost) from scratch</article-title>
        <source>J Mach Learn Res</source>  
        <year>2011</year>  
        <volume>12</volume>  
        <issue>2011</issue>  
        <fpage>2493</fpage>  
        <lpage>537</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jmlr.org/papers/volume12/collobert11a/collobert11a.pdf"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Mikolov</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Corrado</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Dean</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <source>arXiv</source>  
        <year>2013</year>  
        <access-date>2019-03-20</access-date>
        <comment>Efficient estimation of word representations in vector space 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1301.3781.pdf">https://arxiv.org/pdf/1301.3781.pdf</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="770Q1SOtq"/></comment> </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Mikolov</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Sutskever</surname>
            <given-names>I</given-names>
          </name>
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Corrado</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Dean</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Distributed representations of words and phrases and their compositionality</article-title>
        <source>Proceedings of the 26th International Conference on Neural Information Processing Systems - Volume 2</source>  
        <year>2013</year>  
        <conf-name>NIPS'13</conf-name>
        <conf-date>December 5-10, 2013</conf-date>
        <conf-loc>Stateline, NV, USA</conf-loc>
        <fpage>3111</fpage>  
        <lpage>9</lpage> </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Qu</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Ferraro</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Zhou</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Hou</surname>
            <given-names>W</given-names>
          </name>
          <name name-style="western">
            <surname>Schneider</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Baldwin</surname>
            <given-names>T</given-names>
          </name>
        </person-group>
        <article-title>Big data small data, in domain out-of domain, known word unknown word: The impact of word representations on sequence labelling tasks</article-title>
        <year>2015</year>  
        <conf-name>19th Conference on Computational Natural Language Learning (CoNLL)</conf-name>
        <conf-date>July 30-31, 2015</conf-date>
        <conf-loc>Beijing, China</conf-loc>
        <fpage>83</fpage>  
        <lpage>93</lpage> </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Suominen</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Johnson</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Zhou</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Sanchez</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Sirel</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Basilakis</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Hanlen</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Estival</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Dawson</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Kelly</surname>
            <given-names>B</given-names>
          </name>
        </person-group>
        <article-title>Capturing patient information at nursing shift changes: methodological evaluation of speech recognition and information extraction</article-title>
        <source>J Am Med Inform Assoc</source>  
        <year>2014</year>  
        <month>10</month>  
        <day>21</day>  
        <pub-id pub-id-type="doi">10.1136/amiajnl-2014-002868</pub-id>
        <pub-id pub-id-type="medline">25336589</pub-id>
        <pub-id pub-id-type="pii">amiajnl-2014-002868</pub-id></nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Schmidhuber</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Deep learning in neural networks: an overview</article-title>
        <source>Neural Netw</source>  
        <year>2015</year>  
        <month>01</month>  
        <volume>61</volume>  
        <fpage>85</fpage>  
        <lpage>117</lpage>  
        <pub-id pub-id-type="doi">10.1016/j.neunet.2014.09.003</pub-id>
        <pub-id pub-id-type="medline">25462637</pub-id>
        <pub-id pub-id-type="pii">S0893-6080(14)00213-5</pub-id></nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zhou</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Xu</surname>
            <given-names>Q</given-names>
          </name>
          <name name-style="western">
            <surname>Suominen</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Gedeon</surname>
            <given-names>T</given-names>
          </name>
        </person-group>
        <article-title>EPUTION at SemEval-2018 task 2: Emoji prediction with user adaption</article-title>
        <source>Proceedings of The 12th International Workshop on Semantic Evaluation</source>  
        <year>2018</year>  
        <conf-name>SemEval 2018</conf-name>
        <conf-date>June 5-6, 2018</conf-date>
        <conf-loc>New Orleans, Louisiana, USA</conf-loc>
        <fpage>449</fpage>  
        <lpage>53</lpage>  
        <pub-id pub-id-type="doi">10.18653/v1/S18-1071</pub-id></nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Barbieri</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Camacho-Collados</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Ronzano</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Espinosa-Anke</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Ballesteros</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Basile</surname>
            <given-names>V</given-names>
          </name>
          <name name-style="western">
            <surname>Patti</surname>
            <given-names>V</given-names>
          </name>
          <name name-style="western">
            <surname>Saggion</surname>
            <given-names>H</given-names>
          </name>
        </person-group>
        <article-title>SemEval 2018 task 2: Multilingual emoji prediction</article-title>
        <source>Proceedings of The 12th International Workshop on Semantic Evaluation</source>  
        <year>2018</year>  
        <conf-name>SemEval 2018</conf-name>
        <conf-date>June 5-6, 2018</conf-date>
        <conf-loc>New Orleans, Louisiana, USA</conf-loc>
        <fpage>24</fpage>  
        <lpage>33</lpage>  
        <pub-id pub-id-type="doi">10.18653/v1/S18-1003</pub-id></nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Maynard</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Tablan</surname>
            <given-names>V</given-names>
          </name>
          <name name-style="western">
            <surname>Ursu</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Cunningham</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Wilks</surname>
            <given-names>Y</given-names>
          </name>
        </person-group>
        <article-title>Named entity recognition from diverse text types</article-title>
        <source>Proceedings of the Recent Advances in Natural Language Processing 2001 Conference</source>  
        <year>2001</year>  
        <conf-name>RANLP 2001</conf-name>
        <conf-date>September 5-7, 2001</conf-date>
        <conf-loc>Tzigov Chark, Bulgaria</conf-loc>
        <fpage>257</fpage>  
        <lpage>74</lpage> </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Chiticariu</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Krishnamurthy</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Reiss</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Vaithyanathan</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <article-title>Domain adaptation of rule-based annotators for named-entity recognition tasks</article-title>
        <source>Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing (EMNLP)</source>  
        <year>2010</year>  
        <conf-name>EMNLP'10</conf-name>
        <conf-date>October 9-11, 2010</conf-date>
        <conf-loc>Cambridge, Massachusetts, USA</conf-loc>
        <publisher-name>Association for Computational Linguistics (ACL)</publisher-name>
        <fpage>1002</fpage>  
        <lpage>12</lpage> </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Yosinski</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Clune</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Bengio</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Lipson</surname>
            <given-names>H</given-names>
          </name>
        </person-group>
        <article-title>How transferable are features in deep neural networks?</article-title>
        <source>Proceedings of the 27th International Conference on Neural Information Processing Systems - Volume 2</source>  
        <year>2014</year>  
        <conf-name>NIPS'14</conf-name>
        <conf-date>December 8-13, 2014</conf-date>
        <conf-loc>Montreal, Quebec, Canada</conf-loc>
        <fpage>3320</fpage>  
        <lpage>8</lpage> </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Rizzo</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Troncy</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>NERD: A framework for unifying named entity recognition and disambiguation extraction tools</article-title>
        <source>Proceedings of the Demonstrations at the 13th Conference of the European Chapter of the Association for Computational Linguistics</source>  
        <year>2012</year>  
        <conf-name>EACL'12</conf-name>
        <conf-date>April 23-27, 2012</conf-date>
        <conf-loc>Avignon, France</conf-loc>
        <publisher-name>Association for Computational Linguistics (ACL)</publisher-name>
        <fpage>73</fpage>  
        <lpage>6</lpage> </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Suominen</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Pahikkala</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Salakoski</surname>
            <given-names>T</given-names>
          </name>
        </person-group>
        <article-title>Critical points in assessing learning performance via cross-validation</article-title>
        <source>Proceedings of the 2nd International and Interdisciplinary Conference on Adaptive Knowledge Representation and Reasoning</source>  
        <year>2008</year>  
        <conf-name>AKRR'08</conf-name>
        <conf-date>September 17-19, 2008</conf-date>
        <conf-loc>Porvoo, Finland</conf-loc>
        <publisher-name>Helsinki University of Technology</publisher-name>
        <fpage>9</fpage>  
        <lpage>22</lpage> </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Han</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Kashyap</surname>
            <given-names>AL</given-names>
          </name>
          <name name-style="western">
            <surname>Finin</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Mayfield</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Weese</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>UMBC_EBIQUITY-CORE: Semantic textual similarity systems</article-title>
        <year>2013</year>  
        <conf-name>2nd Joint Conference on Lexical and Computational Semantics (*SEM)</conf-name>
        <conf-date>2013</conf-date>
        <conf-loc>Atlanta, GA, USA</conf-loc>
        <fpage>44</fpage>  
        <lpage>52</lpage> </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Chelba</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Mikolov</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Schuster</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Ge</surname>
            <given-names>Q</given-names>
          </name>
          <name name-style="western">
            <surname>Brants</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Koehn</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Robinson</surname>
            <given-names>T</given-names>
          </name>
        </person-group>
        <article-title>One billion word benchmark for measuring progress in statistical language modeling</article-title>
        <source>Proceedings of the 15th Annual Conference of the International Speech Communication Association</source>  
        <year>2014</year>  
        <conf-name>Interspeech 2014</conf-name>
        <conf-date>September 14-18, 2014</conf-date>
        <conf-loc>Singapore</conf-loc>
        <fpage>2635</fpage>  
        <lpage>9</lpage> </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Uzuner</surname>
            <given-names>O</given-names>
          </name>
          <name name-style="western">
            <surname>Sibanda</surname>
            <given-names>TC</given-names>
          </name>
          <name name-style="western">
            <surname>Luo</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Szolovits</surname>
            <given-names>P</given-names>
          </name>
        </person-group>
        <article-title>A de-identifier for medical discharge summaries</article-title>
        <source>Artif Intell Med</source>  
        <year>2008</year>  
        <month>01</month>  
        <volume>42</volume>  
        <issue>1</issue>  
        <fpage>13</fpage>  
        <lpage>35</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/18053696"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1016/j.artmed.2007.10.001</pub-id>
        <pub-id pub-id-type="medline">18053696</pub-id>
        <pub-id pub-id-type="pii">S0933-3657(07)00132-7</pub-id>
        <pub-id pub-id-type="pmcid">PMC2271040</pub-id></nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Uzuner</surname>
            <given-names>O</given-names>
          </name>
          <name name-style="western">
            <surname>Luo</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Szolovits</surname>
            <given-names>P</given-names>
          </name>
        </person-group>
        <article-title>Evaluating the state-of-the-art in automatic de-identification</article-title>
        <source>J Am Med Inform Assoc</source>  
        <year>2007</year>  
        <volume>14</volume>  
        <issue>5</issue>  
        <fpage>550</fpage>  
        <lpage>63</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://jamia.oxfordjournals.org/cgi/pmidlookup?view=long&amp;pmid=17600094"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1197/jamia.M2444</pub-id>
        <pub-id pub-id-type="medline">17600094</pub-id>
        <pub-id pub-id-type="pii">M2444</pub-id>
        <pub-id pub-id-type="pmcid">PMC1975792</pub-id></nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Uzuner</surname>
            <given-names>O</given-names>
          </name>
          <name name-style="western">
            <surname>Solti</surname>
            <given-names>I</given-names>
          </name>
          <name name-style="western">
            <surname>Xia</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Cadag</surname>
            <given-names>E</given-names>
          </name>
        </person-group>
        <article-title>Community annotation experiment for ground truth generation for the I2B2 medication challenge</article-title>
        <source>J Am Med Inform Assoc</source>  
        <year>2010</year>  
        <volume>17</volume>  
        <issue>5</issue>  
        <fpage>519</fpage>  
        <lpage>23</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://jamia.oxfordjournals.org/cgi/pmidlookup?view=long&amp;pmid=20819855"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1136/jamia.2010.004200</pub-id>
        <pub-id pub-id-type="medline">20819855</pub-id>
        <pub-id pub-id-type="pii">17/5/519</pub-id>
        <pub-id pub-id-type="pmcid">PMC2995684</pub-id></nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Uzuner</surname>
            <given-names>O</given-names>
          </name>
          <name name-style="western">
            <surname>Solti</surname>
            <given-names>I</given-names>
          </name>
          <name name-style="western">
            <surname>Cadag</surname>
            <given-names>E</given-names>
          </name>
        </person-group>
        <article-title>Extracting medication information from clinical text</article-title>
        <source>J Am Med Inform Assoc</source>  
        <year>2010</year>  
        <volume>17</volume>  
        <issue>5</issue>  
        <fpage>514</fpage>  
        <lpage>8</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/20819854"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1136/jamia.2010.003947</pub-id>
        <pub-id pub-id-type="medline">20819854</pub-id>
        <pub-id pub-id-type="pii">17/5/514</pub-id>
        <pub-id pub-id-type="pmcid">PMC2995677</pub-id></nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zhou</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Suominen</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Hanlen</surname>
            <given-names>L</given-names>
          </name>
        </person-group>
        <article-title>Evaluation data and benchmarks for cascaded speech recognition and entity extraction</article-title>
        <source>Proceedings of the Third Edition Workshop on Speech, Language &amp; Audio in Multimedia</source>  
        <year>2015</year>  
        <conf-name>SLAM'15</conf-name>
        <conf-date>October 30, 2015</conf-date>
        <conf-loc>Brisbane, Australia</conf-loc>
        <publisher-name>ACM</publisher-name>
        <fpage>15</fpage>  
        <lpage>8</lpage> </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="web">
        <source>Wikimedia Downloads</source>  
        <access-date>2018-03-07</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://dumps.wikimedia.org/">https://dumps.wikimedia.org/</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6xjWnvCVm"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="web">
        <source>UMBC Ebiquity</source>  
        <access-date>2018-03-07</access-date>
        <comment>UMBC webbase corpus 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://ebiquity.umbc.edu/resource/html/id/351">https://ebiquity.umbc.edu/resource/html/id/351</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6xjX5evo8"/></comment> </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="web">
        <source>Google Research</source>  
        <access-date>2018-03-07</access-date>
        <comment>One billion word benchmark for measuring progress in statistical language modeling 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://research.google.com/pubs/pub41880.html">https://research.google.com/pubs/pub41880.html</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6xjXGBpnU"/></comment> </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="web">
        <source>NCBI</source>  
        <access-date>2019-03-19</access-date>
        <comment>PubMed 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ncbi.nlm.nih.gov/pubmed">https://www.ncbi.nlm.nih.gov/pubmed</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6xjXaRZXU"/></comment> </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="web">
        <source>NCBI</source>  
        <access-date>2018-03-07</access-date>
        <comment>PMC 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ncbi.nlm.nih.gov/pmc/">https://www.ncbi.nlm.nih.gov/pmc/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6xjXgELxS"/></comment> </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Weischedel</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Brunstein</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <source>Linguistic Data Consortium</source>  
        <year>2005</year>  
        <comment>BBN pronoun coreference and entity type corpus 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://catalog.ldc.upenn.edu/LDC2005T33">https://catalog.ldc.upenn.edu/LDC2005T33</ext-link></comment> </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Manning</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Surdeanu</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Bauer</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Finkel</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Bethard</surname>
            <given-names>S J</given-names>
          </name>
          <name name-style="western">
            <surname>McClosky</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <article-title>The Stanford CoreNLP natural language processing toolkit</article-title>
        <year>2014</year>  
        <conf-name>52nd Annual Meeting of the Association for Computational Linguistics: System Demonstrations</conf-name>
        <conf-date>June 22-27, 2014</conf-date>
        <conf-loc>Baltimore, Maryland, USA</conf-loc>
        <fpage>55</fpage>  
        <lpage>60</lpage> </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="web">
        <source>Google Code</source>  
        <year>2013</year>  
        <comment>Tool for computing continuous distributed representations of words 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://code.google.com/archive/p/word2vec/">https://code.google.com/archive/p/word2vec/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="748ORV86C"/></comment> </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="web">
        <source>UMLS Terminology Server (and API)</source>  
        <year>2018</year>  
        <access-date>2018-11-23</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://umls.terminology.tools/">https://umls.terminology.tools/</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="748Oh3vNG"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Lafferty</surname>
            <given-names>JD</given-names>
          </name>
          <name name-style="western">
            <surname>McCallum</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Pereira</surname>
            <given-names>FC</given-names>
          </name>
        </person-group>
        <article-title>Conditional random fields: Probabilistic models for segmenting and labeling sequence data</article-title>
        <source>Proceedings of the Eighteenth International Conference on Machine Learning</source>  
        <year>2001</year>  
        <conf-name>ICML'01</conf-name>
        <conf-date>July 1, 2001</conf-date>
        <conf-loc>Williamstown, MA, USA</conf-loc>
        <publisher-name>Morgan Kaufmann Publishers Inc</publisher-name>
        <fpage>282</fpage>  
        <lpage>9</lpage> </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Suominen</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Zhou</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Goeuriot</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Kelly</surname>
            <given-names>L</given-names>
          </name>
        </person-group>
        <article-title>Task 1 of the CLEF eHealth evaluation lab 2016: Handover information extraction</article-title>
        <year>2016</year>  
        <conf-name>Conference and Labs of the Evaluation Forum (CLEF)</conf-name>
        <conf-date>September 5-8, 2016</conf-date>
        <conf-loc>Evora, Portugal</conf-loc>
        <fpage>1</fpage>  
        <lpage>14</lpage> </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Song</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>He</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Hu</surname>
            <given-names>Q</given-names>
          </name>
          <name name-style="western">
            <surname>He</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Luo</surname>
            <given-names>G</given-names>
          </name>
        </person-group>
        <article-title>ECNU at 2016 eHealth task 1: Handover information extraction</article-title>
        <year>2016</year>  
        <conf-name>Conference and Labs of the Evaluation Forum (CLEF)</conf-name>
        <conf-date>September 5-8, 2016</conf-date>
        <conf-loc>Evora, Portugal</conf-loc>
        <fpage>147</fpage>  
        <lpage>156</lpage> </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Quiroz</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Mennes</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Dehghani</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Distributional semantics for medical information extraction</article-title>
        <year>2016</year>  
        <conf-name>Conference and Labs of the Evaluation Forum (CLEF)</conf-name>
        <conf-date>September 5-8, 2016</conf-date>
        <conf-loc>Evora, Portugal</conf-loc>
        <fpage>109</fpage>  
        <lpage>22</lpage> </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wilcoxon</surname>
            <given-names>F</given-names>
          </name>
        </person-group>
        <article-title>Individual comparisons by ranking methods</article-title>
        <source>Biometrics</source>  
        <year>1945</year>  
        <volume>1</volume>  
        <issue>6</issue>  
        <fpage>80</fpage>  
        <lpage>3</lpage>  
        <pub-id pub-id-type="doi">10.1007/978-1-4612-4380-9_16</pub-id></nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>LeCun</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Bengio</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Hinton</surname>
            <given-names>G</given-names>
          </name>
        </person-group>
        <article-title>Deep learning</article-title>
        <source>Nature</source>  
        <year>2015</year>  
        <month>05</month>  
        <day>28</day>  
        <volume>521</volume>  
        <issue>7553</issue>  
        <fpage>436</fpage>  
        <lpage>44</lpage>  
        <pub-id pub-id-type="doi">10.1038/nature14539</pub-id>
        <pub-id pub-id-type="medline">26017442</pub-id>
        <pub-id pub-id-type="pii">nature14539</pub-id></nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Suominen</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Lehtikunnas</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Hiissa</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Back</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Karsten</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Salakoski</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Salantera</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <article-title>Natural Language Processing for Nursing Documentation</article-title>
        <year>2005</year>  
        <conf-name>Proceedings of the Second International Conference on Computational Intelligence in Medicine and Health Care (CIMED), Lisbon, Portugal</conf-name>
        <conf-date>2005</conf-date>
</nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Suominen</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Lehtikunnas</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Back</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Karsten</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Salakoski</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Salantera</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <article-title>Applying Language Technology to Nursing Documents: Pros and Cons with a Focus on Ethics</article-title>
        <source>International Journal of Medical Informatics</source>  
        <year>2007</year>  
        <volume>76</volume>  
        <fpage>S293</fpage>  
        <lpage>S301</lpage>  
        <pub-id pub-id-type="medline">17604685</pub-id></nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
