<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i12e22898</article-id>
      <article-id pub-id-type="pmid">33372893</article-id>
      <article-id pub-id-type="doi">10.2196/22898</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Extraction of Family History Information From Clinical Notes: Deep Learning and Heuristics Approach</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Wang</surname>
            <given-names>Yanshan</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Benítez-Andrades</surname>
            <given-names>José Alberto</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Liu</surname>
            <given-names>Sijia</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Kate</surname>
            <given-names>Rohit</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Silva</surname>
            <given-names>João Figueira</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Electronics, Telecommunications and Informatics</institution>
            <institution>Institute of Electronics and Informatics Engineering of Aveiro</institution>
            <institution>University of Aveiro</institution>
            <addr-line>IEETA - Universidade de Aveiro</addr-line>
            <addr-line>Campus Universitário do Santiago</addr-line>
            <addr-line>Aveiro, 3810-193</addr-line>
            <country>Portugal</country>
            <phone>351 234 370 500</phone>
            <email>joaofsilva@ua.pt</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5535-754X</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Almeida</surname>
            <given-names>João Rafael</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0729-2264</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Matos</surname>
            <given-names>Sérgio</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1941-3983</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Electronics, Telecommunications and Informatics</institution>
        <institution>Institute of Electronics and Informatics Engineering of Aveiro</institution>
        <institution>University of Aveiro</institution>
        <addr-line>Aveiro</addr-line>
        <country>Portugal</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Information and Communications Technologies</institution>
        <institution>University of A Coruña</institution>
        <addr-line>A Coruña</addr-line>
        <country>Spain</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: João Figueira Silva <email>joaofsilva@ua.pt</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>12</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>29</day>
        <month>12</month>
        <year>2020</year>
      </pub-date>
      <volume>8</volume>
      <issue>12</issue>
      <elocation-id>e22898</elocation-id>
      <history>
        <date date-type="received">
          <day>7</day>
          <month>8</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>25</day>
          <month>9</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>20</day>
          <month>10</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>3</day>
          <month>11</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©João Figueira Silva, João Rafael Almeida, Sérgio Matos. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 29.12.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://medinform.jmir.org/2020/12/e22898/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Electronic health records store large amounts of patient clinical data. Despite efforts to structure patient data, clinical notes containing rich patient information remain stored as free text, greatly limiting its exploitation. This includes family history, which is highly relevant for applications such as diagnosis and prognosis.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to develop automatic strategies for annotating family history information in clinical notes, focusing not only on the extraction of relevant entities such as family members and disease mentions but also on the extraction of relations between the identified entities.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>This study extends a previous contribution for the 2019 track on family history extraction from national natural language processing clinical challenges by improving a previously developed rule-based engine, using deep learning (DL) approaches for the extraction of entities from clinical notes, and combining both approaches in a hybrid end-to-end system capable of successfully extracting family member and observation entities and the relations between those entities. Furthermore, this study analyzes the impact of factors such as the use of external resources and different types of embeddings in the performance of DL models.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The approaches developed were evaluated in a first task regarding entity extraction and in a second task concerning relation extraction. The proposed DL approach improved observation extraction, obtaining F<sub>1</sub> scores of 0.8688 and 0.7907 in the training and test sets, respectively. However, DL approaches have limitations in the extraction of family members. The rule-based engine was adjusted to have higher generalizing capability and achieved family member extraction F<sub>1</sub> scores of 0.8823 and 0.8092 in the training and test sets, respectively. The resulting hybrid system obtained F<sub>1</sub> scores of 0.8743 and 0.7979 in the training and test sets, respectively. For the second task, the original evaluator was adjusted to perform a more exact evaluation than the original one, and the hybrid system obtained F<sub>1</sub> scores of 0.6480 and 0.5082 in the training and test sets, respectively.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>We evaluated the impact of several factors on the performance of DL models, and we present an end-to-end system for extracting family history information from clinical notes, which can help in the structuring and reuse of this type of information. The final hybrid solution is provided in a publicly available code repository.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>rule-based</kwd>
        <kwd>deep learning</kwd>
        <kwd>contextual embeddings</kwd>
        <kwd>word embeddings</kwd>
        <kwd>family medical history</kwd>
        <kwd>information extraction</kwd>
        <kwd>clinical notes</kwd>
        <kwd>electronic health record</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>For many years, the rapid progress in technology has continually pushed the field of medicine forward, striving for the improvement of health care quality. Novel tools provide new possibilities, such as access to new types of information (eg, medical imaging and genome sequencing) and larger amounts of data, along with associated challenges such as how to store and organize the resulting vast amounts of multimodal medical information. The electronic health record (EHR) solves this by providing an electronic infrastructure for storing structured and unstructured information generated over time [<xref ref-type="bibr" rid="ref1">1</xref>], thus preserving patient trajectories by maintaining a longitudinal view over the medical history of patients. Such data can then be explored for applications such as cohort selection [<xref ref-type="bibr" rid="ref2">2</xref>] or to provide medical entities with clinical decision support [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>].</p>
        <p>Despite being harder to explore, unstructured data can contain relevant information that is not obtainable elsewhere [<xref ref-type="bibr" rid="ref6">6</xref>], which is particularly evident in clinical notes, where medical narratives allow for more accurate and complete descriptions of medical situations [<xref ref-type="bibr" rid="ref7">7</xref>]. As there is significant interest in exploring and reusing information from clinical notes, a possible approach is to process free text and extract relevant information that can be stored as structured data [<xref ref-type="bibr" rid="ref7">7</xref>]. This process has historically been manual, consisting of having clinical experts review clinical notes in search for relevant information. However, heavy reliance on a manual component greatly limits the potential and usability of this process as it cannot scale with the increasing volumes of information [<xref ref-type="bibr" rid="ref5">5</xref>].</p>
        <p>Another possible solution for these cost and scalability issues is the development of automatic systems capable of annotating and extracting relevant content from clinical notes, which has led to greater research efforts in the field of clinical natural language processing (NLP) in the past years. These efforts have led to the creation of international challenges that provide appropriate data sets and enable performance benchmarking of new methods and solutions. The importance of these challenges is widely acknowledged because of the current lack of adequate resources [<xref ref-type="bibr" rid="ref8">8</xref>], which impedes the development of more advanced solutions [<xref ref-type="bibr" rid="ref5">5</xref>]. As such, despite the acknowledged interest and value of automated solutions, their development is very complex as it must cope with the challenging nature of working with clinical free text and with the lack of publicly available resources.</p>
        <p>Owing to the flexible nature of clinical notes, developed solutions can target the extraction of different types of information from clinical narratives. This process of extracting information is usually split into named entity recognition (NER), named entity normalization (NEN), and relation extraction (RE). NER has the objective of detecting entities of interest in the text, such as diseases or family relatives, whereas NEN is responsible for mapping these entities to normalized concepts in coding standards, such as Systematized Nomenclature of Medicine Clinical Terms (SNOMED CT) [<xref ref-type="bibr" rid="ref9">9</xref>] or RxNorm [<xref ref-type="bibr" rid="ref10">10</xref>] in the case of medical text. RE is focused on detecting relationships between the entities (eg, detecting connections between drugs and adverse drug events) and is very important as it allows the leap from concept extraction to concept understanding [<xref ref-type="bibr" rid="ref5">5</xref>].</p>
        <p>This study focuses on the extraction of the family history component from clinical notes, which can provide insight into disease susceptibility and is important for the prevention, diagnosis, and treatment of specific diseases [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. A demonstration example is the work by Wang et al [<xref ref-type="bibr" rid="ref13">13</xref>] in which they used a text corpus containing 3 million clinical notes to analyze the patient family history, focusing on family members, medical problems, and their associations, and discovered (1) considerable compliance between positive and negative medical issues mentioned in the reports considering the diagnosis and family history and (2) the existence of medical problems a decade before the diagnosis dates of the determined problem. This study extends a previous contribution [<xref ref-type="bibr" rid="ref14">14</xref>] by exploring deep learning (DL) approaches for the detection of family history entities in clinical notes and integrating this component in an improved version of the previously developed solution, creating a hybrid system for extracting entities and relations from family history information. The final hybrid solution is provided in a publicly available code repository [<xref ref-type="bibr" rid="ref15">15</xref>].</p>
        <p>The main contributions of this study are as follows:</p>
        <list list-type="bullet">
          <list-item>
            <p>This study proposes a strategy to automatically annotate large amounts of EHRs, allowing quick detection of comorbidities with family relations.</p>
          </list-item>
          <list-item>
            <p>We evaluate the impact of using different DL architectures and embeddings in clinical information extraction.</p>
          </list-item>
          <list-item>
            <p>We improved the family history information extraction pipeline by combining automatic concept annotations with DL and rule-based architectures to discover entities and relations in the clinical notes.</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Related Work</title>
        <p>This study is focused on performing NER on clinical notes to extract family history information, namely, family members and observations such as disease mentions, and on detecting associations between detected entities. Correctly detecting family relatives in clinical notes is far from a straightforward task as the following situations must be considered: (1) notes frequently have cascaded information regarding family relatives (eg, “The patient’s grandmother had cancer in her late 60s [she had a cousin who died from cancer] but his grandfather has no history of cancer.”); (2) notes can mention family members with no blood relations, such as the partners of the patients and their relatives; or (3) the relationship of the family member may not be directly expressed. The existence of such situations where the relationship is complex to understand because of the numerous kinship degrees can eventually lead computational systems to lose context, failing to correctly determine the relationship between the detected entity and the patient. Similarly, disease observations can be troublesome to detect because, for instance, they can be mentioned as a sequence of several complex terms or even by disjoint mentions.</p>
        <p>Existing solutions typically follow rule-based or machine learning-based approaches; however, it is also possible to combine both approaches in hybrid systems. Furthermore, owing to the recognized potential of DL approaches in the medical field [<xref ref-type="bibr" rid="ref16">16</xref>], recent years have seen the emergence of DL-based solutions [<xref ref-type="bibr" rid="ref5">5</xref>].</p>
        <p>For many years, rule-based models were the preferred architecture when developing solutions for extracting family history information, supported by the rationale that, in theory, a good set of rules can manage good concept coverage, thus producing excellent results. Goryachev et al [<xref ref-type="bibr" rid="ref17">17</xref>] proposed a rule-based algorithm and demonstrated the success of this kind of architecture, whereas Friedlin et al [<xref ref-type="bibr" rid="ref18">18</xref>] used a rule-based model to extract and code clinical data from clinical reports.</p>
        <p>With the growing interest in the development of NLP solutions, generic frameworks such as unstructured information management application [<xref ref-type="bibr" rid="ref19">19</xref>] and general architecture for text engineering [<xref ref-type="bibr" rid="ref20">20</xref>] were created to provide support in the development of information extraction systems, from which popular solutions such as clinical text analysis and knowledge extraction system were derived [<xref ref-type="bibr" rid="ref21">21</xref>]. Despite aiming to offer modular flexible processing workflows that can be reused, these frameworks have the drawback of requiring a deep understanding of the tools given their high-level abstractions.</p>
        <p>In contrast with the previous frameworks, toolkits were developed with the goal of providing a set of stand-alone tools that can be easily combined in a processing pipeline. Examples of popular toolkits are the Natural Language Toolkit (NLTK) [<xref ref-type="bibr" rid="ref22">22</xref>], Apache OpenNLP [<xref ref-type="bibr" rid="ref23">23</xref>], Stanford CoreNLP [<xref ref-type="bibr" rid="ref24">24</xref>], and Clinical Language Annotation, Modelling and Processing [<xref ref-type="bibr" rid="ref25">25</xref>]. Despite the interest in these toolkits, they were developed considering general text instead of biomedical or clinical text, which commonly require specialized tools. Neji was developed to tackle this limitation, providing a modular architecture that integrates specialized modules for biomedical NLP. Thus, it combines the benefits of general frameworks and toolkits with those of specialized tools [<xref ref-type="bibr" rid="ref26">26</xref>]. These modules can apply different methodologies, such as rule-based models, dictionary matching, and machine learning models. Moreover, Neji provides configurable web services that enable easy integration of its annotation capabilities in external tools [<xref ref-type="bibr" rid="ref27">27</xref>].</p>
        <p>More recently, with the success of DL approaches in text processing problems, DL is being adopted in solutions designed for biomedical and clinical text. One of the key areas that DL has impacted is representation learning, for instance, with the creation of dense representations such as word embeddings. These can be fine-tuned to specific domains and can be easily integrated in other learning algorithms, helping them achieve improved performances in NLP tasks [<xref ref-type="bibr" rid="ref28">28</xref>]. BioWordVec is an example of publicly available biomedical and clinical word embeddings [<xref ref-type="bibr" rid="ref29">29</xref>]. However, these embeddings still have the limitation of not considering context, which results in the same word having the same representation when used in completely different contexts (eg, <italic>suits</italic> in <italic>your offer suits our needs</italic> and <italic>he always wears suits</italic>). This was addressed by the development of contextual embeddings such as Embeddings from Language Models [<xref ref-type="bibr" rid="ref30">30</xref>] and bidirectional encoder representations from transformers (BERT) [<xref ref-type="bibr" rid="ref31">31</xref>]. These embeddings can also be fine-tuned to specific domains, resulting in the creation of variations such as BioBERT [<xref ref-type="bibr" rid="ref32">32</xref>] and clinicalBERT [<xref ref-type="bibr" rid="ref33">33</xref>].</p>
        <p>Embeddings are widely used in DL solutions because the resulting dense representations can be easily explored by various DL model architectures. One particular architecture that achieves state-of-the-art results in biomedical and clinical text problems such as NER is the bidirectional long short-term memory (BiLSTM) network coupled with conditional random fields (CRF). Dai et al [<xref ref-type="bibr" rid="ref34">34</xref>] compared the use of word embeddings (word2vec) and BERT for NER in clinical notes, with a BiLSTM-CRF model, and demonstrated better performance when using BERT to represent clinical text. Li et al [<xref ref-type="bibr" rid="ref35">35</xref>] used character embeddings, medical dictionaries, and part-of-speech features in a BiLSTM-Att-CRF model, which consists of a BiLSTM with an attention layer bridging the BiLSTM and CRF. This architecture was used to perform clinical NER in EHR notes, and it obtained interesting results, demonstrating the potential of attention mechanisms [<xref ref-type="bibr" rid="ref35">35</xref>]. More recently, Shi et al [<xref ref-type="bibr" rid="ref36">36</xref>] used a deep joint learning architecture based on BiLSTMs with word and part-of-speech embeddings for extracting family history information, such as entities and relations from clinical text. Although the demonstrated success of DL approaches at extracting entities and relations from clinical notes, particularly when using BiLSTM-CRF derived architectures, has led to a rapid growth in such solutions, these frequently fail to provide system implementations, which hinders their adoption and reproducibility.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Set</title>
        <p>This work was originally developed under the scope of the 2019 national NLP clinical challenges (n2c2)/open health NLP track on family history extraction, which had the objective of extracting family history information from EHR clinical notes [<xref ref-type="bibr" rid="ref37">37</xref>]. This challenge track was split into 2 subtasks: the first one being oriented to named entities and the second one focusing on extracting relations between those entities. More detailed descriptions of each subtask are provided in this section. The second subtask directly depended on the first one, as the challenge had the objective of evaluating developed systems as end-to-end family history summarization solutions.</p>
        <p>Training and test data sets were provided by challenge organizers. The training data set consisted of 99 unannotated clinical notes, manual annotations of entities and relations for each clinical note, and a gold standard file with eligible entities and relations for the full training set; the test data set consisted of 117 unannotated clinical notes (a gold standard file with eligible entities and relations for the full test set was only provided after the challenge terminated). Both gold standard files contained the annotations for each document without providing any additional information (eg, annotation span or respective line in document). More detailed statistics of data sets are provided in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Detailed data set statistics.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="370"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Type</td>
                <td>Training</td>
                <td>Test</td>
                <td>Total</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">Clinical notes, n (%)</td>
                <td>99 (45.8)</td>
                <td>117 (54.2)</td>
                <td>216 (100)</td>
              </tr>
              <tr valign="top">
                <td colspan="5">
                  <bold>Annotated entities, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Family member</td>
                <td>667 (53.4)</td>
                <td>583 (46.6)</td>
                <td>1250 (100)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Observation</td>
                <td>930 (50.7)</td>
                <td>906 (49.3)</td>
                <td>1836 (100)</td>
              </tr>
              <tr valign="top">
                <td colspan="5">
                  <bold>Annotated relations, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Family member: living status</td>
                <td>376 (51.9)</td>
                <td>349 (48.1)</td>
                <td>725 (100)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Family member: observation</td>
                <td>740 (49.5)</td>
                <td>755 (50.5)</td>
                <td>1495 (100)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>The first subtask had the objective of identifying family member entities and disease mentions in the clinical notes. When extracting family member entities, it was required to extract both the family relationship (eg, son, father, or uncle) and the family side (eg, maternal). The list of relationships considered was provided by organizers and comprised the following: father, mother, parent, brother, sister, son, daughter, child, grandfather, grandmother, grandparent, cousin, sibling, uncle, and aunt. Any relationship outside the provided list (eg, nephew or great grandparent) should be considered invalid. Moreover, clinical notes could contain family member mentions related to the patient and to the patient’s partner. As the challenge was focused on the patient, all partner-associated family relationships should be discarded.</p>
        <p>The second subtask focused on extracting relations between the previously extracted entities and considered 2 types of relations. The first type involved detecting living status mentions, which should be used to assign a living status score to the respective family member entity. This living status score was computed by multiplying the properties of being alive and healthy, where each property could have a value from 0 to 2 (0: no, 1: not applicable, and 2: yes). The second type of relations involved assigning relations between detected disease mentions and the corresponding family members, taking into consideration whether the observation was negated (eg, nonnegated: <italic>the patient has diabetes</italic> and negated: <italic>there are no reports of cancer</italic>).</p>
      </sec>
      <sec>
        <title>Shortest Dependency Path and Coreference Resolution</title>
        <p>The first approach, which was originally used in the challenge submission, combined handcrafted rules and dictionary matching with dependency parsing and coreference resolution. First, a preprocessing step based on Stanford CoreNLP dependency parsing and coreference resolution annotators was applied to all documents. <xref rid="figure1" ref-type="fig">Figure 1</xref> illustrates the result of applying these annotators to an example text fragment.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Illustrative example of dependency parsing and coreference resolution from Stanford CoreNLP. amod: adjectival modifier; cop: copula; coref: coreference; det: determiner; DT: determiner; JJ: adjective; nmod: nominal modifier; NN: noun; nsubj: nominal subject; obj: object; PRP$: possessive pronoun; VBZ: verb third person singular present.</p>
          </caption>
          <graphic xlink:href="medinform_v8i12e22898_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>For the first subtask, the process of entity extraction was divided into 2 subproblems targeting family members and disease mention extraction separately. To extract family member entities, a lexicon was compiled that included all family relationships considered for the challenge, expanded with lexical variants and plural forms, along with others identified by examining an extended family tree, such as partner, great grandmother, nephew, and half-uncle. Although the latter family members should not be considered in the final evaluation, their inclusion was necessary at this stage to avoid erroneous associations with other family members during the following step.</p>
        <p>The next step consisted of coreference resolution, for which a coreference graph was created to add the corresponding family member annotations to coreferencing pronouns. Considering the example presented in <xref rid="figure1" ref-type="fig">Figure 1</xref>, the family member annotation assigned to the mention <italic>wife</italic> is carried over to the pronoun <italic>her</italic> based on the coreference relation. In the example, this also means that the <italic>maternal aunt</italic> mention gets associated with the <italic>wife</italic> family member. In addition, a process of family relationship resolution was performed by applying a set of rules to map extracted mentions to the corresponding family link, with the resulting family link inheriting the family side if it had been extracted. In the same example sentence, the aunt’s son is mapped to <italic>cousin</italic>, and this carries over the family side mention, leading to the final annotation of (wife’s) <italic>maternal cousin</italic>. Finally, the resulting list of extracted family members was filtered to remove family links other than those targeted in the challenge.</p>
        <p>The process of extracting disease mentions consisted of a simpler pipeline, in which a dictionary was first compiled from the unified medical language system Metathesaurus [<xref ref-type="bibr" rid="ref38">38</xref>]. This dictionary consisted of a filtered version of the Metathesaurus, containing entries only from the Anatomy and Disorders semantic groups, and was used to configure a Neji annotation service. Once the service was set up, all documents were annotated through the web service and a list of extracted mentions per document was created. As this annotation mechanism could introduce many irrelevant entries (false positives) resulting in a lower precision, a false positive list was created by automatically annotating the corpus provided in the SemEval task on Analysis of Clinical Text [<xref ref-type="bibr" rid="ref39">39</xref>] and identifying false positives against the gold standard annotation. The resulting false positive list was then used to filter the disease mentions extracted in the n2c2 subtask.</p>
        <p>For the second subtask, the objective was to extract 2 types of relations for the previously obtained entities. First, a small lexicon regarding living status was extracted from the training corpus, resulting in the following list: <italic>alive, alive and well, dead, deceased, died, doing well, generally healthy, good general health, good health, healthy, living, living and well, otherwise healthy, passed away, stillborn, well, and without problems</italic>. This lexicon was used to extract living status mentions from the documents, which were then mapped into an integer value using the scale previously described in the data set subsection. Finally, the dependency graph created in the first subtask was used to extract the shortest dependency path that associated each disease mention/living status with a family member. This approach disregarded the negation component in observations; therefore, all disease-family member relations were considered nonnegated.</p>
      </sec>
      <sec>
        <title>Rule-Based Engine</title>
        <p>The second approach used in the official submissions for the n2c2 challenge track followed a different strategy and consisted of a rule-based engine. This solution involved the creation of rules for family member recognition and dictionaries for observation extraction and processed both subtasks as an end-to-end system outputting the required submission files for both subtasks. After the challenge contribution, this approach was adapted and improved as described further in this section.</p>
        <p>The engine processed each sentence in a document sequentially, aiming to link sentences when one of the system processing flows did not detect family members in a sentence. Therefore, using this approach, we created a system that tried to answer the following 3 questions:</p>
        <list list-type="order">
          <list-item>
            <p>Who is the subject of the sentence?</p>
          </list-item>
          <list-item>
            <p>Which observations are in the sentence?</p>
          </list-item>
          <list-item>
            <p>Is the subject alive?</p>
          </list-item>
        </list>
        <p>Although answering these 3 questions does not entirely solve the proposed problem, managing to correctly answer them simplifies the process of establishing relations between extracted concepts. The first step in the processing flow splits the document into sentences and removes a considerable set of words. This set was composed of the most common English verbs and the most common conjugations, several adjectives, and names. This procedure preserved relevant words and reduced the distance between words that allowed the correct identification of family members and their respective family side. For instance, for a rule-based system, it is easier to find the family member <italic>cousin</italic> in the cleaned sentence <italic>patient’s uncle son</italic> than in the original sentence <italic>the patient’s uncle has one son</italic>. In this example, the original sentence could be erroneously processed as a sentence where the primary subject is the patient’s uncle, instead of the cousin.</p>
        <p>After cleaning the sentences, the system applied rules that enable the identification of the subject in the most trivial cases, using exact matching. When no subject was identified, the system processed the sentence using another component, with more complex rules. In this case, rules have more properties, such as a set of words that should exist before and after the detected family member and whether the detected family member should be discarded. These properties enable the generation of very precise rules, which, if used, can increase the potential of the system for the specifications of the challenge at the cost of reducing its reuse in other scenarios (ie, a trade-off between specificity and generalization capability).</p>
        <p>When no family member was detected with the previous rules, the system executed another component that tried to identify if the sentence currently being processed was related to the previous sentence. If the sentence being processed was the first sentence in the document, the system considered by default that it was related to the patient. Finally, the system ran a last component, which was always executed, to discover whether the sentence was related to the patient or the patient’s partner. If the sentence was associated with the partner, the system discarded the family member entity as required by the challenge guidelines.</p>
        <p>Observation extraction consisted of a simpler process than that of family member detection. However, it followed the same principles and used the initial preprocessing for cleaning a set of words. For the challenge, we created a vocabulary based on the observations annotated in the training set and used it in the test set. Simultaneously, the system applied rules to map the detected observation to the identified subject in the sentence. When it was not possible to identify a relation in a sentence, the system did not discard the extracted observations as they were still important for the first subtask.</p>
        <p>Living status identification was performed using 2 sets of rules: one targeting deceased subjects and the other targeting healthy and alive subjects. Owing to time constraints, we did not try to identify cases where subjects were alive but not healthy because based on a statistical analysis, mentions for this group of entries represented only 12.2% (46/376) of the living status entries in the gold standard of the training set.</p>
        <p>The rule-based engine pipeline processes documents individually and sentence by sentence following a sequential flow. In this pipeline, the detected words have different levels of importance. For instance, terms like <italic>partner</italic> and <italic>patient</italic> coexisting in the same sentences are weighted differently. These weights were considered by the complementary rules during subject identification in a sentence. Disambiguation was performed using a set of verbs and specific words in situations where it was not clear whether the sentence was related to the patient, the patient’s relatives, the patient’s partner, or the partner’s relatives. <xref rid="figure2" ref-type="fig">Figure 2</xref> shows an excerpt of a clinical note that illustrates clearly how the system processes original sentences and what the result of this processing is.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>The 3 left concepts represent the main points that the system tries to identify in the text on the right. Highlighted on the right are relevant words for the system to be able to make decisions. Auxiliary words that help identify the subject are represented in green. The words used to identify if the relatives are related to the patient or the partner are highlighted in purple. Blue represents annotated family members, and yellow is used for diseases. Red is used to highlight words concerning subject living status.</p>
          </caption>
          <graphic xlink:href="medinform_v8i12e22898_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>This engine managed good results in the annotation of the family members of the patient. However, the methodology used to extract observations was not the best, regardless of possible improvements to produce more accurate results. Therefore, in a postchallenge contribution, we removed the components for detecting observations and improved components responsible for extracting the family members of the patient and their living status. The living status component was reused with small adjustments to be more generic and compatible with different data sets, yet maintaining the same philosophy of trying only to identify whether the patient is healthy and alive or dead.</p>
        <p>The family members annotator was rebuilt following the initial principles but without specific sets of rules that were generated from the training set of the challenge (ie, to reduce overfitting). The system pipeline and the way its components are interconnected are presented in <xref rid="figure3" ref-type="fig">Figure 3</xref>. This flow starts by trying to identify if the subject in the sentence is the patient. If not identified, the previously described complex rules are executed. The third component performs exact matching over a clean sentence for trivial annotations, and the output of these components is filtered to disambiguate relations between family members and to remove any relations that should be discarded (eg, to adhere to challenge evaluation guidelines).</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Overview of the processing workflow responsible for family members detection, for the rule-based engine.</p>
          </caption>
          <graphic xlink:href="medinform_v8i12e22898_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>In the complex rules component, rules follow a 6-part structure that defines the keyword that triggers the rule (eg, <italic>father</italic> or <italic>grandparent</italic>) and a list of terms that must appear before or after this keyword. Next, this structure contains a flag that indicates whether the annotated relative must be considered or discarded and indicates which relative was detected. As an example, if the keyword <italic>grandparents</italic> is detected in the clean text, a rule can identify it as a paternal grandparent if the words <italic>patients</italic> and <italic>paternal</italic> appear, in this order, before the keyword.</p>
        <p>Regarding the disambiguation component, the system contains a set of rules composed of 4 elements. These rules have 2 relatives and a mapping to the real relation of this subject to the patient. As an example, if the component annotates and processes the relatives <italic>father</italic> and <italic>brother</italic>, the system will map them to paternal uncle and return the corrected annotation. Besides the above-mentioned examples, the rule-based system contains a more extensive list of rules that were used for the processes of partial and exact match search.</p>
      </sec>
      <sec>
        <title>DL for Entity Extraction</title>
        <p>Owing to the acknowledged potential and success of recent DL solutions in clinical text problems, we extended the original contribution with a novel approach based on DL. The implementation of this solution considered several aspects, namely:</p>
        <list list-type="bullet">
          <list-item>
            <p>Following the trend in state-of-the-art solutions, we explored the widely used attention-based BiLSTM-CRF with the attention mechanism placed between the BiLSTM and CRF layers [<xref ref-type="bibr" rid="ref35">35</xref>] and compared it with a simple linear classifier (with softmax) to evaluate the impact of model architecture in downstream tasks.</p>
          </list-item>
          <list-item>
            <p>Similar to the approach presented by Yang et al [<xref ref-type="bibr" rid="ref40">40</xref>], an additional task regarding named entity discovery was integrated with the objective of improving model perception of unknown entities. This downstream task was set as optional; thus, it is possible to train models for NER and for NER and discovery.</p>
          </list-item>
          <list-item>
            <p>Different types of embeddings were explored for clinical text representation to assess their impact on model performance. BioWordVec word embeddings and clinicalBERT contextual embeddings were used.</p>
          </list-item>
          <list-item>
            <p>To evaluate the impact of using external resources in model development, Neji annotations were integrated into the input to the model.</p>
          </list-item>
        </list>
        <p>A schematized view of the model architecture used in this study (attention-based BiLSTM-CRF) is presented in <xref rid="figure4" ref-type="fig">Figure 4</xref>.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Schematic diagram of the general deep learning model architecture used in this study, showing the 2 possible downstream tasks. The entity recognition task is always executed, whereas the entity discovery task was added as optional to enable model development with and without it. BiLSTM: bidirectional long short-term memory; B-Obs: beginning observation; B-PFM: beginning patient family member; CRF: conditional random field; I-Obs: inside observation; n: number of tokens in tokenized sentence; O: outside.</p>
          </caption>
          <graphic xlink:href="medinform_v8i12e22898_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>The named entity discovery downstream task consists of a binary classification problem where the system classifies whether an input token is part of an entity or not, disregarding the respective class (ie, if it is an observation or family member mention). This optional task was integrated with the objective of making the model consider the trade-off between discovering more entities and correctly identifying them. When enabled, it is reflected in model training during backpropagation, with the total loss resulting from a linear combination between the losses of both downstream tasks.</p>
        <p>Before training any model, it was necessary to preprocess the data set. Text preprocessing began by splitting each document into sentences using the sentence splitter from NLTK, followed by tokenization. However, 2 different tokenization methods had to be used because word and contextual embeddings take different tokenizing approaches: the NLTK word tokenizer was used for word embeddings, and the BERT tokenizer was used for contextual embeddings. The resulting tokenized sentences were tagged using the BIO (beginning, inside, and outside) tagging scheme. Finally, to assess the impact of using external resources, all documents were annotated using Neji, which uses standard vocabularies to detect entity mentions in the input text. Neji annotations, consisting of text spans and entity classes, were then mapped to the tokens in the corresponding sentence, with each token being assigned an integer value similar to the BIO scheme: 0 for tokens not annotated by Neji, 1 for the first token in an annotation, and 2 for the following tokens. The resulting lists of classes were normalized and concatenated to the embedding representations and then forwarded through the BiLSTM layer.</p>
        <p>Model training and evaluation were performed using 5-fold cross validation. The Adam optimizer was used, and models were trained with early stopping (the patience parameter can be adjusted). Each training epoch consisted of 100 iterations, during which the training partition was randomly sampled. A detailed list of hyperparameters is provided in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>List of hyperparameters used for deep learning model training.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="800"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>Hyperparameters</td>
                <td>Value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Dimension of BioWordVec embeddings</td>
                <td>200</td>
              </tr>
              <tr valign="top">
                <td>Dimension of clinicalBERT<sup>a</sup> embeddings</td>
                <td>768</td>
              </tr>
              <tr valign="top">
                <td>BiLSTM<sup>b</sup> hidden size</td>
                <td>256</td>
              </tr>
              <tr valign="top">
                <td>Number of attention heads</td>
                <td>2</td>
              </tr>
              <tr valign="top">
                <td>Epochs</td>
                <td>100</td>
              </tr>
              <tr valign="top">
                <td>Patience</td>
                <td>5</td>
              </tr>
              <tr valign="top">
                <td>Iterations per epoch</td>
                <td>100</td>
              </tr>
              <tr valign="top">
                <td>Dropout rate</td>
                <td>0.5</td>
              </tr>
              <tr valign="top">
                <td>Learning rate</td>
                <td>0.001</td>
              </tr>
              <tr valign="top">
                <td>Batch size</td>
                <td>32</td>
              </tr>
              <tr valign="top">
                <td>Epochs for training BioWordVec embeddings</td>
                <td>2</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>clinicalBERT: clinical bidirectional encoder representations from transformers.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>BiLSTM: bidirectional long short-term memory.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>In addition, because contextual embeddings provide additional information when compared with word embeddings, we enabled the training of word embeddings for a number of epochs at the beginning of model training, after which the embedding layer was frozen. Finally, as contextual embeddings can partition words in various smaller tokens (eg, <italic>carcinoma</italic> is split into <italic>car</italic>, <italic>##cin</italic>, and <italic>##oma</italic>), the model could classify only parts of a word as entities (eg, <italic>##cin</italic> and <italic>##oma</italic> classified as entities and <italic>car</italic> as nonentity), resulting in incomplete entities and poor results. Therefore, we added a reconstruction mechanism where the full word is considered when only a part of it is classified as an entity.</p>
        <p>The DL approach obtained interesting results in observation extraction but poor performance in family member detection, which contrasts with the rule-based approach. As such, we created a final hybrid solution that integrates the DL approach as an observation extraction module in the rule-based engine.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>The original contribution consisted of the development of 2 different approaches for entity and RE: one using shortest dependency paths combined with coreference resolution and another using a rule-based engine. These approaches were validated in the n2c2 challenge on family history extraction. Results obtained in the test data set (<xref ref-type="table" rid="table3">Table 3</xref>) showed that overall, the first approach performed better in the entity extraction subtask, whereas the rule-based approach performed better in the RE subtask.</p>
      <table-wrap position="float" id="table3">
        <label>Table 3</label>
        <caption>
          <p>Original overall test results for the 2 national natural language processing clinical challenges subtasks; approach 1: shortest dependency path and coreference resolution and approach 2: rule-based engine.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="30"/>
          <col width="370"/>
          <col width="200"/>
          <col width="200"/>
          <col width="200"/>
          <thead>
            <tr valign="top">
              <td colspan="2">Subtasks and approach</td>
              <td>Precision</td>
              <td>Recall</td>
              <td>F<sub>1</sub> score</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td colspan="5">
                <bold>Subtask 1</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Approach 1</td>
              <td>0.6501</td>
              <td>0.8892</td>
              <td>0.7510</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Approach 2</td>
              <td>0.8507</td>
              <td>0.6211</td>
              <td>0.7180</td>
            </tr>
            <tr valign="top">
              <td colspan="5">
                <bold>Subtask 2</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Approach 1</td>
              <td>0.5406</td>
              <td>0.5005</td>
              <td>0.5198</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Approach 2</td>
              <td>0.6468</td>
              <td>0.5992</td>
              <td>0.6221</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      <p>As the results obtained during the challenge had margins for improvement, and DL-based approaches dominated system submissions in the challenge, we opted to experiment with DL to improve the previous contribution. For the sake of simplicity, tables presenting DL-related results only contain F<sub>1</sub> score values. However, more detailed results (including precision and recall metrics) are presented in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      <p>For the DL-based approach, we started by testing a simple model configuration composed of a linear layer and a softmax function, using contextual embeddings for clinical text representation (<xref ref-type="table" rid="table4">Table 4</xref>). This simple model served as a reference point to assess the potential of using contextual embeddings to represent clinical text.</p>
      <table-wrap position="float" id="table4">
        <label>Table 4</label>
        <caption>
          <p>Cross validation results on the training data set (5-fold cross validation) for subtask 1 using a deep learning model composed of clinical bidirectional encoder representations from transformers embeddings, a linear layer, and softmax function, with and without token reconstruction. For simplicity purposes, only F<sub>1</sub> scores are presented.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="30"/>
          <col width="370"/>
          <col width="200"/>
          <col width="200"/>
          <col width="200"/>
          <thead>
            <tr valign="top">
              <td colspan="2">Reconstruction approach and model configuration</td>
              <td>Family member</td>
              <td>Observations</td>
              <td>Overall</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td colspan="5">
                <bold>No reconstruction</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline</td>
              <td>0.3071</td>
              <td>0.6620</td>
              <td>0.5647</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline+ED<sup>a</sup></td>
              <td>0.1764</td>
              <td>0.6397</td>
              <td>0.5204</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline+Neji</td>
              <td>0.3088</td>
              <td>0.7019</td>
              <td>0.5924</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline+ED+Neji</td>
              <td>0.1840</td>
              <td>0.6841</td>
              <td>0.5523</td>
            </tr>
            <tr valign="top">
              <td colspan="5">
                <bold>Reconstruction</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline</td>
              <td>0.3071</td>
              <td>0.7444</td>
              <td>0.6241</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline+ED</td>
              <td>0.1764</td>
              <td>0.7142</td>
              <td>0.5753</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline+Neji</td>
              <td>0.3088</td>
              <td>0.7712</td>
              <td>0.6418</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline+ED+Neji</td>
              <td>0.1840</td>
              <td>0.7593</td>
              <td>0.6070</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table4fn1">
            <p><sup>a</sup>ED: entity discovery.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <p>After testing with a simple architecture and evaluating the impact of adding an entity discovery downstream task and external resources to the model, we proceeded to the more complex architecture of the attention-based BiLSTM-CRF, which has been widely explored in the state of the art. This architecture was first tested using contextual embeddings for text representation to assess the impact of model capacity on the resulting model performance (<xref ref-type="table" rid="table5">Table 5</xref>). After observing the improvements resulting from the change in model architecture, we then evaluated the influence of the embeddings used in the final system results by training the same architecture with word embeddings (<xref ref-type="table" rid="table5">Table 5</xref>). As word embeddings capture less information than their contextual counterpart, we integrated the possibility of fine-tuning word embeddings for a number of epochs at the beginning of the training process, freezing the embeddings after that point.</p>
      <table-wrap position="float" id="table5">
        <label>Table 5</label>
        <caption>
          <p>Cross validation results on the training data set (5-fold cross validation) for subtask 1 using the attention-based bidirectional long short-term memory network coupled with conditional random fields with different types of embeddings. When using word embeddings, some configurations enabled embedding fine-tuning for 2 epochs. For simplicity purposes, only F<sub>1</sub> scores are presented.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="30"/>
          <col width="370"/>
          <col width="200"/>
          <col width="200"/>
          <col width="200"/>
          <thead>
            <tr valign="top">
              <td colspan="2">Embeddings type and model configuration</td>
              <td>Family member</td>
              <td>Observations</td>
              <td>Overall</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td colspan="5">
                <bold>clinicalBERT<sup>a</sup></bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline</td>
              <td>0.4103</td>
              <td>0.8596</td>
              <td>0.7194</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline+ED<sup>b</sup></td>
              <td>0.3788</td>
              <td>0.8481</td>
              <td>0.7023</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline+Neji</td>
              <td>0.3545</td>
              <td>0.8478</td>
              <td>0.6908</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline+ED+Neji</td>
              <td>0.3485</td>
              <td>0.8688</td>
              <td>0.7081</td>
            </tr>
            <tr valign="top">
              <td colspan="5">
                <bold>BioWordVec</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline</td>
              <td>0.5921</td>
              <td>0.8140</td>
              <td>0.7317</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline+ED</td>
              <td>0.6553</td>
              <td>0.8276</td>
              <td>0.7627</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline+ET<sup>c</sup></td>
              <td>0.6166</td>
              <td>0.8285</td>
              <td>0.7513</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline+ED+ET</td>
              <td>0.6219</td>
              <td>0.8367</td>
              <td>0.7579</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline+ED+Neji</td>
              <td>0.7222</td>
              <td>0.8529</td>
              <td>0.8036</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Baseline+ED+ET+Neji</td>
              <td>0.7266</td>
              <td>0.8587</td>
              <td>0.8092</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table5fn1">
            <p><sup>a</sup>clinicalBERT: clinical bidirectional encoder representations from transformers.</p>
          </fn>
          <fn id="table5fn2">
            <p><sup>b</sup>ED: entity discovery.</p>
          </fn>
          <fn id="table5fn3">
            <p><sup>c</sup>ET: embeddings training.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <p>Although the use of a more complex model architecture provided promising results, there was a common trend among all used models, which was the fact that these approaches performed much better at extracting observations than family members.</p>
      <p>Considering the fact that the rule-based engine struggled in observation extraction while obtaining good performance in family member extraction [<xref ref-type="bibr" rid="ref14">14</xref>] and that it performed better in the RE subtask than the shortest dependency path approach, we created a hybrid system that complements the rule-based engine by adding a DL module responsible for extracting disease mentions. <xref ref-type="table" rid="table6">Table 6</xref> presents the results obtained with the hybrid solution in the test data set.</p>
      <table-wrap position="float" id="table6">
        <label>Table 6</label>
        <caption>
          <p>Test results for both subtasks using the final hybrid solution: rule-based engine combined with deep learning module for observation extraction.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="30"/>
          <col width="370"/>
          <col width="200"/>
          <col width="200"/>
          <col width="200"/>
          <thead>
            <tr valign="top">
              <td colspan="2">Subtask and annotation type</td>
              <td>Precision</td>
              <td>Recall</td>
              <td>F<sub>1</sub> score</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td colspan="5">
                <bold>Subtask 1</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Family members</td>
              <td>0.7887</td>
              <td>0.8307</td>
              <td>0.8092</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Observations</td>
              <td>0.7523</td>
              <td>0.8332</td>
              <td>0.7907</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Overall</td>
              <td>0.7662</td>
              <td>0.8322</td>
              <td>0.7979</td>
            </tr>
            <tr valign="top">
              <td colspan="5">
                <bold>Subtask 2</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Living status</td>
              <td>0.5964</td>
              <td>0.6462</td>
              <td>0.6248</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Observations</td>
              <td>0.4635</td>
              <td>0.4371</td>
              <td>0.4499</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Overall</td>
              <td>0.5100</td>
              <td>0.5063</td>
              <td>0.5082</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <sec>
          <title>DL for Entity Extraction</title>
          <p>Word embeddings have been the go-to method for text representation in the past years. However, contextual embeddings have made a big impact in recent years as they consider positional information and context in the resulting representation, which provides them with higher disambiguation capability than that of word embeddings. As such, our initial tests were performed using publicly available contextual embeddings fine-tuned on biomedical and clinical corpora.</p>
          <p>First, we analyzed the impact of reconstructing annotated tokens on the resulting performance. Tests with a simple model (<xref ref-type="table" rid="table4">Table 4</xref>) showed improved performance in every model configuration when using token reconstruction. However, it is noticeable that only observation extraction benefited from this process, with family member extraction maintaining its F<sub>1</sub> scores. This is explained by the fact that disease mentions can be very specific and more complex when compared with family members, for instance, the word <italic>mother</italic> is tokenized by the contextual embedding tokenizers as <italic>mother</italic>, whereas <italic>carcinoma</italic> is tokenized as <italic>car, ##cin, and ##oma</italic>. Owing to this different word decomposition, the DL model can classify only parts of the word as an entity, resulting in incomplete entities. The reconstruction procedure solved this issue by adding the missing parts to these entities. Tests with the simple model also demonstrated that the use of external resources such as Neji annotations can help improve entity extraction, whereas adding an additional downstream task regarding entity discovery led to worse results with this model. Finally, it was clear that the model managed to extract disease mentions from clinical notes but failed in the detection of family members, leading to lower overall F<sub>1</sub> scores.</p>
          <p>After performing the initial tests with a simple model and verifying the importance of token reconstruction when using contextual embeddings, we moved to the more complex architecture of the attention-based BiLSTM-CRF (<xref ref-type="table" rid="table5">Table 5</xref>). To be able to compare it with the previous model, we began by testing the new model with contextual embeddings. Starting with baseline models, it is possible to see that changing to the higher capacity model increased F<sub>1</sub> scores by approximately 0.1 across all categories. Next, it is possible to observe that complementing the baseline model with the entity discovery task and Neji resources resulted in worse overall F<sub>1</sub> scores; nonetheless, their combination led to an increase in the F<sub>1</sub> score for observation extraction (0.8596 to 0.8688).</p>
          <p>Finally, to evaluate the influence of using different types of embeddings to represent clinical text, we tested the same model architecture with publicly available word embeddings fine-tuned on biomedical and clinical corpora. Comparing baseline models, word embeddings led to a higher overall performance (0.7317 vs 0.7194), lowering the observation extraction F<sub>1</sub> score but improving that of family member extraction. Adding extra mechanisms such as external annotations and entity discovery progressively increased model performance, with the final model showing a much higher overall F<sub>1</sub> score compared with the best contextual embedding configuration (0.8092 vs 0.7194). This higher overall performance was caused by a significant increase in the family member F<sub>1</sub> score (0.4103 to 0.7266), although observation extraction decreased from 0.8688 to 0.8587 F<sub>1</sub> score.</p>
          <p>The previous results demonstrated that despite the increasing focus on contextual embeddings, word embeddings can obtain good results when using state-of-the-art model architectures. In spite of its much better performance in family member extraction, the word embedding model still obtained subpar performance when compared with the rule-based engine in the same task (0.7266 vs 0.8823). As the objective was to integrate the best approach for observation extraction in the rule-based engine, and contextual embeddings obtained the upper hand in that aspect (0.8688 vs 0.8587), we integrated the attention-based BiLSTM-CRF with clinicalBERT embeddings in the hybrid system.</p>
        </sec>
        <sec>
          <title>Hybrid System</title>
          <p>The original rule-based system was developed focusing on the n2c2 challenge and contained sets of rules that were adjusted to the training set. These rules were removed after the challenge, whereas other existing rules were carefully adjusted to create a better system that retained its generalizing capabilities.</p>
          <p>With the objective of exploring the best developed approaches for each component of the subtasks, we based the final system on the improved rule-based engine and substituted its weaker component (observation extraction) by a DL-based module. The result was a hybrid system capable of extracting family members and observations along with their respective relations.</p>
          <p>As experienced in the original contribution, the results obtained in the test set showed a decrease in performance (<xref ref-type="table" rid="table6">Table 6</xref>), presenting an overall F<sub>1</sub> score of 0.7979 in subtask 1 and an overall F<sub>1</sub> score of 0.5082 in subtask 2. For the first subtask, the hybrid system showed an improvement from the previous best result of 0.7510 overall F<sub>1</sub> to 0.7979 (a 4.69 percentage point increase). Regarding the RE subtask, although the overall F<sub>1</sub> score decreased from 0.6221 to 0.5082, there are 2 aspects that should be considered. The first aspect is that adjustments were made to the rule-based engine, which reduced the specificity of its rules and impacted the challenge performance. The second one is that results presented for subtask 2 were obtained using a modified version of the evaluator. The adjusted evaluator performs a more exact analysis of the system output, resulting in lower performance values compared with the original counterpart. A more detailed explanation of this last aspect is provided in the following subsection, <italic>Evaluation and Error Analysis</italic>.</p>
        </sec>
      </sec>
      <sec>
        <title>Evaluation and Error Analysis</title>
        <p>The annotations resulting from the approaches described were evaluated using precision, recall, and F<sub>1</sub> score metrics. The items considered in subtask 1 evaluation were the patient family members combined with their family side and the observations in each document. Regarding family members, if the system does not properly extract relatives’ family side, the results are considered a false positive and a false negative. However, in the case of observations, the evaluator was more flexible. More specifically, if observations were partially annotated (eg, for the observation <italic>diabetes type 2,</italic> the system extracted only <italic>diabetes</italic>), the evaluator considered it a true positive. This evaluator was provided by the n2c2 organizers, and we maintained its principles.</p>
        <p>The evaluation process for the RE subtask considered (1) the attribution of living status to family members, with correct family side, and (2) the association of observations to family members, including the indication of whether the observation was negated or not. The original evaluator considers each family member, observation, and negation status triple correctly identified by a system. However, the evaluator considers it as a true positive if only the observation or only the negation status were correctly extracted for a given relative. This formulation produces additional true positives, even for annotations that are not completely correct. Therefore, we changed the behavior of this evaluator to count a true positive only when the system correctly extracted the family member, the respective family side, the (possibly partial) observation, and the observation logical status, as we believe that the extraction is more useful if it is completely correct. As an effect of this change, the F<sub>1</sub> scores of our challenge submission decreased by approximately 10 percentage points when compared with the official results. For instance, when using the new evaluator, approach 1 reduced its F<sub>1</sub> score from 0.5198 to 0.4431, whereas approach 2 decreased its F<sub>1</sub> score from 0.6221 to 0.4818.</p>
        <p>To understand what affects our results, we randomly selected some false positives and performed a manual analysis on the training set. This analysis led to the detection of inconsistencies in the gold standard annotations, which adversely affected the performance of our system. For instance, in the same clinical notes, 2 identical sentences regarding different family members were annotated with different living statuses. Another example was that at least 14 relatives without living status were annotated when this was present in the gold standard raw data. This raw data consists of the XML files supplied along with the clinical notes in the training set, which were the base of the submission gold standard file. In some of the clinical notes, we detected observations that were present in the text but not annotated in the gold standard and observations that were detected and present in subtask 1 gold standard but not attributed to any subject (despite having the family member also annotated in the gold standard). Although we were not able to perform an in-depth analysis and assess how much this affected our scores, the identified inconsistencies had some impact on performance.</p>
      </sec>
      <sec>
        <title>Limitations and Future Work</title>
        <p>The resulting system was built to be more generic than the previous version, which was used in the n2c2 challenge. Despite the improvements made to the system, there are still some limitations. <xref ref-type="boxed-text" rid="box1">Textbox 1</xref> presents some sentences extracted from the clinical notes that are representative examples of the system limitations.</p>
        <boxed-text id="box1" position="float">
          <title>Analyses of some of the false positives and false negatives classified by the proposed system. Family member annotations are emphasized in the sentence using italics.</title>
          <p>Child not applicable (N/A)</p>
          <p>“Mr. Smith’s father suffers from cancer. He has several <italic>children</italic> through several other women...”</p>
          <p>Daughter N/A</p>
          <p>“The maternal/paternal great-aunt that has diabetes had several children. One of these individuals had a cancer of an unknown type and is deceased. The second <italic>daughter</italic> is the individual with diabetes type 2...”</p>
          <p>Parent N/A</p>
          <p>“John’s <italic>parents</italic> are both reportedly healthy at age 63, but they have not seen a physician in approximately 30 years. John’s mother had one second trimester miscarriage...”</p>
          <p>Sibling N/A</p>
          <p>“Saul’s father is a 39-year-old man who is a college graduate and who has a total of 5 <italic>siblings</italic>...”</p>
          <p>Grandparents N/A</p>
          <p>“While living in Texas, they lived with extended family, including Peter’s <italic>grandparents</italic>...”</p>
        </boxed-text>
        <p>The first example of these limitations concerns the establishment of incorrect sentence connections in certain situations. Depending on the scenario, in the first sentence in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>, it could be annotated <italic>child</italic> or <italic>sibling</italic>, as it is influenced by the order in which rules are applied during family member detection. However, in this example, the pronoun <italic>he</italic> refers to the patient’s father. Thus, the mentioned children are the patient’s half-siblings, a relative that should not be considered according to the guidelines.</p>
        <p>The problem in the second example is also related to sentence linking. The system detects a daughter because it loses the sentence context. In addition, the existence of <italic>maternal/paternal</italic> before a relative led to inconsistencies in the detection because there are no rules for these situations. Despite all those problems, the relative annotated as daughter is in fact a third-degree cousin, a relationship that should not be considered. The third and fourth examples show other cases where there was an incorrect family member annotation because of the system losing context within the sentences.</p>
        <p>The final example is a special case because the annotation was correctly performed but was not considered in the gold standard annotations, as the clinical notes did not provide any clinical information about the relative. Moreover, the clinical information regarded as necessary for annotating a relative mention is not exclusively composed of observations and may include other types of information such as medication intake or medical procedures, which invalidates the possibility of filtering such situations based on observation associations alone.</p>
        <p>Although these might not be the only problems, the limitations presented were those that stood out the most. This led us to analyze possible future work for this contribution, which we could split into different topics. First, we need to test this system in another data set, with a more solid gold standard. This will help us understand the performance of the system as well as its versatility in detail. Another task is the extension of the clinical information extracted. The current version has models designed to extract observations. However, we intend to build other models to extract drugs and procedures, among other medical categories that were not required in the challenge. This extension would lead to a reformulation of the detection of patient’s relatives and allow filtering mentions with no medical information, such as the last example in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>. Finally, there is also the possibility of exploring machine learning and DL for the process of establishing relations between extracted entities.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>We present an extension to a previous work that focused on extracting family history information from clinical notes. Specifically, we developed a more generic system and improved the previous F<sub>1</sub> score in the entity extraction subtask by approximately 5 percentage points by combining different approaches. Although the rule-based engine succeeded in extracting patient relatives, this approach failed in the detection of observations because of the range of possibilities in the text. However, the use of DL models helped rectify this gap, with the hybrid system taking advantage of the best characteristics of these 2 methodologies. The hybrid solution is provided in a publicly available code repository.</p>
        <p>This study promotes new strategies to easily annotate large amounts of clinical reports currently available in EHR systems. If these reports were annotated and indexed, it would be simpler for a clinician to search for reports mentioning specific concepts. In addition, with data in a structured format, this information can be reused in other scenarios, such as predicting the patient’s susceptibility or predisposition to diseases.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Detailed results for all deep learning model configurations tested in this work. Precision, recall, and F<sub>1</sub> scores are provided separately for family member and observation extraction along with overall results.</p>
        <media xlink:href="medinform_v8i12e22898_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 72 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">BERT</term>
          <def>
            <p>bidirectional encoder representations from transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BiLSTM</term>
          <def>
            <p>bidirectional long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">BIO</term>
          <def>
            <p>beginning, inside, and outside</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CRF</term>
          <def>
            <p>conditional random fields</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">DL</term>
          <def>
            <p>deep learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">n2c2</term>
          <def>
            <p>national NLP clinical challenges</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">NEN</term>
          <def>
            <p>named entity normalization</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">NER</term>
          <def>
            <p>named entity recognition</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">NLTK</term>
          <def>
            <p>Natural Language Toolkit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">RE</term>
          <def>
            <p>relation extraction</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by the European Union/European Federation of Pharmaceutical Industries and Associations Innovative Medicines Initiative 2 Joint Undertaking under grant agreement No 806968 and by the NEw Targets in DIAstolic heart failure: from coMOrbidites to persoNalizeD medicine (NETDIAMOND) project (POCI-01-0145-FEDER-016385), cofunded by Centro 2020 program, Portugal 2020, European Union. JS and JA are supported by Foundation for Science and Technology (national funds), under the grant numbers PD/BD/142878/2018 and SFRH/BD/147837/2019, respectively.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Katehakis</surname>
              <given-names>DG</given-names>
            </name>
            <name name-style="western">
              <surname>Tsiknakis</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Electronic health record</article-title>
          <source>Wiley Encyclopedia of Biomedical Engineering</source>
          <year>2006</year>
          <publisher-loc>New Jersey, United States</publisher-loc>
          <publisher-name>John Wiley &#38; Sons</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Antunes</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Silva</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Pereira</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Matos</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Rule-based and Machine Learning Hybrid System for Patient Cohort Selection</article-title>
          <source>Proceedings of the 12th International Joint Conference on Biomedical Engineering Systems and Technologies - Volume 5</source>
          <year>2019</year>
          <conf-name>HEALTHINF'19</conf-name>
          <conf-date>February 22-24, 2019</conf-date>
          <conf-loc>Prague, Czech Republic</conf-loc>
          <fpage>59</fpage>
          <lpage>67</lpage>
          <pub-id pub-id-type="doi">10.5220/0007349300590067</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kukafka</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ancker</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chelico</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Khan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mortoti</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Presley</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Stephens</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Redesigning electronic health record systems to support public health</article-title>
          <source>J Biomed Inform</source>
          <year>2007</year>
          <month>08</month>
          <volume>40</volume>
          <issue>4</issue>
          <fpage>398</fpage>
          <lpage>409</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(07)00060-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2007.07.001</pub-id>
          <pub-id pub-id-type="medline">17632039</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(07)00060-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Almeida</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Oliveira</surname>
              <given-names>JL</given-names>
            </name>
          </person-group>
          <article-title>GenericCDSS - A Generic Clinical Decision Support System</article-title>
          <source>2019 IEEE 32nd International Symposium on Computer-Based Medical Systems (CBMS)</source>
          <year>2019</year>
          <conf-name>CBMS'19</conf-name>
          <conf-date>June 5-7, 2019</conf-date>
          <conf-loc>Córdoba, Spain</conf-loc>
          <fpage>186</fpage>
          <lpage>91</lpage>
          <pub-id pub-id-type="doi">10.1109/cbms.2019.00046</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sheikhalishahi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Miotto</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Dudley</surname>
              <given-names>JT</given-names>
            </name>
            <name name-style="western">
              <surname>Lavelli</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rinaldi</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Osmani</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Natural language processing of clinical notes on chronic diseases: systematic review</article-title>
          <source>JMIR Med Inform</source>
          <year>2019</year>
          <month>04</month>
          <day>27</day>
          <volume>7</volume>
          <issue>2</issue>
          <fpage>e12239</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2019/2/e12239/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/12239</pub-id>
          <pub-id pub-id-type="medline">31066697</pub-id>
          <pub-id pub-id-type="pii">v7i2e12239</pub-id>
          <pub-id pub-id-type="pmcid">PMC6528438</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jensen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Soguero-Ruiz</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Mikalsen</surname>
              <given-names>KO</given-names>
            </name>
            <name name-style="western">
              <surname>Lindsetmo</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kouskoumvekaki</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Girolami</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Skrovseth</surname>
              <given-names>SO</given-names>
            </name>
            <name name-style="western">
              <surname>Augestad</surname>
              <given-names>KM</given-names>
            </name>
          </person-group>
          <article-title>Analysis of free text in electronic health records for identification of cancer patient trajectories</article-title>
          <source>Sci Rep</source>
          <year>2017</year>
          <month>04</month>
          <day>7</day>
          <volume>7</volume>
          <fpage>46226</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/srep46226"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/srep46226</pub-id>
          <pub-id pub-id-type="medline">28387314</pub-id>
          <pub-id pub-id-type="pii">srep46226</pub-id>
          <pub-id pub-id-type="pmcid">PMC5384191</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rosenbloom</surname>
              <given-names>ST</given-names>
            </name>
            <name name-style="western">
              <surname>Denny</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lorenzi</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Stead</surname>
              <given-names>WW</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>KB</given-names>
            </name>
          </person-group>
          <article-title>Data from clinical notes: a perspective on the tension between structure and flexible documentation</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2011</year>
          <volume>18</volume>
          <issue>2</issue>
          <fpage>181</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/21233086"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/jamia.2010.007237</pub-id>
          <pub-id pub-id-type="medline">21233086</pub-id>
          <pub-id pub-id-type="pii">jamia.2010.007237</pub-id>
          <pub-id pub-id-type="pmcid">PMC3116264</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Spasic</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Nenadic</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Clinical text data in machine learning: systematic review</article-title>
          <source>JMIR Med Inform</source>
          <year>2020</year>
          <month>03</month>
          <day>31</day>
          <volume>8</volume>
          <issue>3</issue>
          <fpage>e17984</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2020/3/e17984/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/17984</pub-id>
          <pub-id pub-id-type="medline">32229465</pub-id>
          <pub-id pub-id-type="pii">v8i3e17984</pub-id>
          <pub-id pub-id-type="pmcid">PMC7157505</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Stearns</surname>
              <given-names>MQ</given-names>
            </name>
            <name name-style="western">
              <surname>Price</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Spackman</surname>
              <given-names>KA</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>AY</given-names>
            </name>
          </person-group>
          <article-title>SNOMED clinical terms: overview of the development process and project status</article-title>
          <source>Proc AMIA Symp</source>
          <year>2001</year>
          <fpage>662</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/11825268"/>
          </comment>
          <pub-id pub-id-type="medline">11825268</pub-id>
          <pub-id pub-id-type="pii">D010001608</pub-id>
          <pub-id pub-id-type="pmcid">PMC2243297</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Moore</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ganesan</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Nelson</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>RxNorm: prescription for electronic drug information exchange</article-title>
          <source>IT Prof</source>
          <year>2005</year>
          <month>09</month>
          <volume>7</volume>
          <issue>5</issue>
          <fpage>17</fpage>
          <lpage>23</lpage>
          <pub-id pub-id-type="doi">10.1109/MITP.2005.122</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guttmacher</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Collins</surname>
              <given-names>FS</given-names>
            </name>
            <name name-style="western">
              <surname>Carmona</surname>
              <given-names>RH</given-names>
            </name>
          </person-group>
          <article-title>The family history — more important than ever</article-title>
          <source>N Engl J Med</source>
          <year>2004</year>
          <month>11</month>
          <day>25</day>
          <volume>351</volume>
          <issue>22</issue>
          <fpage>2333</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1056/NEJMsb042979</pub-id>
          <pub-id pub-id-type="medline">15564550</pub-id>
          <pub-id pub-id-type="pii">351/22/2333</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dick</surname>
              <given-names>RS</given-names>
            </name>
            <name name-style="western">
              <surname>Steen</surname>
              <given-names>EB</given-names>
            </name>
            <name name-style="western">
              <surname>Detmer</surname>
              <given-names>DE</given-names>
            </name>
          </person-group>
          <source>The Computer-Based Patient Record: An Essential Technology for Health Care, Revised Edition</source>
          <year>1997</year>
          <publisher-loc>Washington, DC</publisher-loc>
          <publisher-name>The National Academies Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Rastegar-Mojarad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Systematic analysis of free-text family history in electronic health record</article-title>
          <source>AMIA Jt Summits Transl Sci Proc</source>
          <year>2017</year>
          <volume>2017</volume>
          <fpage>104</fpage>
          <lpage>13</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/28815117"/>
          </comment>
          <pub-id pub-id-type="medline">28815117</pub-id>
          <pub-id pub-id-type="pmcid">PMC5543380</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Almeida</surname>
              <given-names>João Rafael</given-names>
            </name>
            <name name-style="western">
              <surname>Matos</surname>
              <given-names>Sérgio</given-names>
            </name>
          </person-group>
          <article-title>Rule-based Extraction of Family History Information From Clinical Notes</article-title>
          <source>Proceedings of the 35th Annual ACM Symposium on Applied Computing</source>
          <year>2020</year>
          <month>03</month>
          <conf-name>SAC'20</conf-name>
          <conf-date>March 30-April 3, 2020</conf-date>
          <conf-loc>Online</conf-loc>
          <pub-id pub-id-type="doi">10.1145/3341105.3374000</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="web">
          <article-title>PatientFM: An end-to-end system for extracting family history information from clinical notes</article-title>
          <source>GitHub</source>
          <year>2020</year>
          <access-date>2020-12-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/bioinformatics-ua/PatientFM">https://github.com/bioinformatics-ua/PatientFM</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ching</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Himmelstein</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Beaulieu-Jones</surname>
              <given-names>BK</given-names>
            </name>
            <name name-style="western">
              <surname>Kalinin</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Do</surname>
              <given-names>BT</given-names>
            </name>
            <name name-style="western">
              <surname>Way</surname>
              <given-names>GP</given-names>
            </name>
            <name name-style="western">
              <surname>Ferrero</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Agapow</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Zietz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hoffman</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Rosen</surname>
              <given-names>GL</given-names>
            </name>
            <name name-style="western">
              <surname>Lengerich</surname>
              <given-names>BJ</given-names>
            </name>
            <name name-style="western">
              <surname>Israeli</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lanchantin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Woloszynek</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Carpenter</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Shrikumar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cofer</surname>
              <given-names>EM</given-names>
            </name>
            <name name-style="western">
              <surname>Lavender</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Turaga</surname>
              <given-names>SC</given-names>
            </name>
            <name name-style="western">
              <surname>Alexandari</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>DeCaprio</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Qi</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kundaje</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wiley</surname>
              <given-names>LK</given-names>
            </name>
            <name name-style="western">
              <surname>Segler</surname>
              <given-names>MH</given-names>
            </name>
            <name name-style="western">
              <surname>Boca</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Swamidass</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gitter</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Greene</surname>
              <given-names>CS</given-names>
            </name>
          </person-group>
          <article-title>Opportunities and obstacles for deep learning in biology and medicine</article-title>
          <source>J R Soc Interface</source>
          <year>2018</year>
          <month>04</month>
          <volume>15</volume>
          <issue>141</issue>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/29618526"/>
          </comment>
          <pub-id pub-id-type="doi">10.1098/rsif.2017.0387</pub-id>
          <pub-id pub-id-type="medline">29618526</pub-id>
          <pub-id pub-id-type="pii">rsif.2017.0387</pub-id>
          <pub-id pub-id-type="pmcid">PMC5938574</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goryachev</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng-Treitler</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>Identification and extraction of family history information from clinical reports</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2008</year>
          <month>11</month>
          <day>6</day>
          <fpage>247</fpage>
          <lpage>51</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/18999129"/>
          </comment>
          <pub-id pub-id-type="medline">18999129</pub-id>
          <pub-id pub-id-type="pmcid">PMC2656021</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Friedlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>McDonald</surname>
              <given-names>CJ</given-names>
            </name>
          </person-group>
          <article-title>Using a natural language processing system to extract and code family history data from admission reports</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2006</year>
          <fpage>925</fpage>
          <comment>epub ahead of print<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/17238544"/></comment>
          <pub-id pub-id-type="medline">17238544</pub-id>
          <pub-id pub-id-type="pii">86427</pub-id>
          <pub-id pub-id-type="pmcid">PMC1839517</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ferrucci</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Lally</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>UIMA: an architectural approach to unstructured information processing in the corporate research environment</article-title>
          <source>Nat Lang Eng</source>
          <year>2004</year>
          <volume>10</volume>
          <issue>3-4</issue>
          <fpage>327</fpage>
          <lpage>48</lpage>
          <pub-id pub-id-type="doi">10.1017/s1351324904003523</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cunningham</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>GATE, a general architecture for text engineering</article-title>
          <source>Comput Hum</source>
          <year>2002</year>
          <volume>36</volume>
          <issue>2</issue>
          <fpage>223</fpage>
          <lpage>54</lpage>
          <pub-id pub-id-type="doi">10.1023/A:1014348124664</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>GK</given-names>
            </name>
            <name name-style="western">
              <surname>Masanz</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ogren</surname>
              <given-names>PV</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sohn</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kipper-Schuler</surname>
              <given-names>KC</given-names>
            </name>
            <name name-style="western">
              <surname>Chute</surname>
              <given-names>CG</given-names>
            </name>
          </person-group>
          <article-title>Mayo clinical text analysis and knowledge extraction system (cTAKES): architecture, component evaluation and applications</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2010</year>
          <volume>17</volume>
          <issue>5</issue>
          <fpage>507</fpage>
          <lpage>13</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/20819853"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/jamia.2009.001560</pub-id>
          <pub-id pub-id-type="medline">20819853</pub-id>
          <pub-id pub-id-type="pii">17/5/507</pub-id>
          <pub-id pub-id-type="pmcid">PMC2995668</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bird</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Klein</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Loper</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <source>Natural Language Processing with Python: Analyzing Text with the Natural Language Toolkit, 1st Edition</source>
          <year>2009</year>
          <publisher-loc>California, United States</publisher-loc>
          <publisher-name>O'Reilly Media</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="web">
          <article-title>Welcome to Apache OpenNLP: the Apache OpenNLP library is a machine learning based toolkit for the processing of natural language text</article-title>
          <source>Apache OpenNLP</source>
          <access-date>2020-06-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://opennlp.apache.org/">http://opennlp.apache.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Surdeanu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bauer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Finkel</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bethard</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>McClosky</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>The Stanford CoreNLP Natural Language Processing Toolkit</article-title>
          <source>Proceedings of 52nd Annual Meeting of the Association for Computational Linguistics: System Demonstrations</source>
          <year>2014</year>
          <conf-name>ACL'14</conf-name>
          <conf-date>June 23-24, 2014</conf-date>
          <conf-loc>Baltimore, Maryland</conf-loc>
          <fpage>55</fpage>
          <lpage>60</lpage>
          <pub-id pub-id-type="doi">10.3115/v1/p14-5010</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Soysal</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Pakhomov</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>CLAMP - a toolkit for efficiently building customized clinical natural language processing pipelines</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2018</year>
          <month>03</month>
          <day>1</day>
          <volume>25</volume>
          <issue>3</issue>
          <fpage>331</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/29186491"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocx132</pub-id>
          <pub-id pub-id-type="medline">29186491</pub-id>
          <pub-id pub-id-type="pii">4657212</pub-id>
          <pub-id pub-id-type="pmcid">PMC7378877</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Campos</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Matos</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Oliveira</surname>
              <given-names>JL</given-names>
            </name>
          </person-group>
          <article-title>A modular framework for biomedical concept recognition</article-title>
          <source>BMC Bioinformatics</source>
          <year>2013</year>
          <month>09</month>
          <day>24</day>
          <volume>14</volume>
          <fpage>281</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-14-281"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2105-14-281</pub-id>
          <pub-id pub-id-type="medline">24063607</pub-id>
          <pub-id pub-id-type="pii">1471-2105-14-281</pub-id>
          <pub-id pub-id-type="pmcid">PMC3849280</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Matos</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Configurable web-services for biomedical document annotation</article-title>
          <source>J Cheminform</source>
          <year>2018</year>
          <month>12</month>
          <day>21</day>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>68</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.doi.org/10.1186/s13321-018-0317-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13321-018-0317-4</pub-id>
          <pub-id pub-id-type="medline">30578450</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13321-018-0317-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC6755557</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Distributed Representations of Words and Phrases and Their Compositionality</article-title>
          <source>Proceedings of the 26th International Conference on Neural Information Processing Systems - Volume 2</source>
          <year>2013</year>
          <conf-name>NIPS'13</conf-name>
          <conf-date>December 5-10, 2013</conf-date>
          <conf-loc>Lake Tahoe</conf-loc>
          <fpage>3111</fpage>
          <lpage>9</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>BioWordVec, improving biomedical word embeddings with subword information and MeSH</article-title>
          <source>Sci Data</source>
          <year>2019</year>
          <month>05</month>
          <day>10</day>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>52</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41597-019-0055-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41597-019-0055-0</pub-id>
          <pub-id pub-id-type="medline">31076572</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41597-019-0055-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC6510737</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peters</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Neumann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Iyyer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gardner</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Deep Contextualized Word Representations</article-title>
          <source>Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)</source>
          <year>2018</year>
          <conf-name>NAACL'18</conf-name>
          <conf-date>June 1-6, 2018</conf-date>
          <conf-loc>New Orleans, Louisiana</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/n18-1202</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding</article-title>
          <source>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)</source>
          <year>2019</year>
          <conf-name>NAACL'19</conf-name>
          <conf-date>June 2-7, 2019</conf-date>
          <conf-loc>Minneapolis, Minnesota</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/n19-1423</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>So</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title>
          <source>Bioinformatics</source>
          <year>2020</year>
          <month>02</month>
          <day>15</day>
          <volume>36</volume>
          <issue>4</issue>
          <fpage>1234</fpage>
          <lpage>40</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/31501885"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id>
          <pub-id pub-id-type="medline">31501885</pub-id>
          <pub-id pub-id-type="pii">5566506</pub-id>
          <pub-id pub-id-type="pmcid">PMC7703786</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alsentzer</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Boag</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Publicly Available Clinical BERT Embeddings</article-title>
          <source>Proceedings of the 2nd Clinical Natural Language Processing Workshop</source>
          <year>2019</year>
          <conf-name>ClinicalNLP'19</conf-name>
          <conf-date>June 7, 2019</conf-date>
          <conf-loc>Minneapolis, Minnesota, USA</conf-loc>
          <fpage>72</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/w19-1909</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Ni</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Bai</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Named Entity Recognition Using BERT BiLSTM CRF for Chinese Electronic Health Records</article-title>
          <source>2019 12th International Congress on Image and Signal Processing, BioMedical Engineering and Informatics (CISP-BMEI)</source>
          <year>2019</year>
          <conf-name>CISP-BMEI'19</conf-name>
          <conf-date>October 19-21, 2019</conf-date>
          <conf-loc>Suzhou, China</conf-loc>
          <fpage>1</fpage>
          <lpage>5</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/abstract/document/8965823"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/cisp-bmei48845.2019.8965823</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhai</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>An attention-based deep learning model for clinical named entity recognition of Chinese electronic medical records</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2019</year>
          <month>12</month>
          <day>5</day>
          <volume>19</volume>
          <issue>Suppl 5</issue>
          <fpage>235</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-019-0933-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-019-0933-6</pub-id>
          <pub-id pub-id-type="medline">31801540</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-019-0933-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC6894110</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Family history information extraction via deep joint learning</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2019</year>
          <month>12</month>
          <day>27</day>
          <volume>19</volume>
          <issue>Suppl 10</issue>
          <fpage>277</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-019-0995-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-019-0995-5</pub-id>
          <pub-id pub-id-type="medline">31881967</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-019-0995-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC6933634</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="web">
          <article-title>2019 n2c2 Shared-task and Workshop, Track 2: n2c2/ohnlp Track on Family History Extraction</article-title>
          <source>Harvard Medical School</source>
          <access-date>2020-06-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://n2c2.dbmi.hms.harvard.edu/track2">https://n2c2.dbmi.hms.harvard.edu/track2</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bodenreider</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>The unified medical language system (UMLS): integrating biomedical terminology</article-title>
          <source>Nucleic Acids Res</source>
          <year>2004</year>
          <month>01</month>
          <day>1</day>
          <volume>32</volume>
          <issue>Database issue</issue>
          <fpage>D267</fpage>
          <lpage>70</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/14681409"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/nar/gkh061</pub-id>
          <pub-id pub-id-type="medline">14681409</pub-id>
          <pub-id pub-id-type="pii">32/suppl_1/D267</pub-id>
          <pub-id pub-id-type="pmcid">PMC308795</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pradhan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Elhadad</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Manandhar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Semeval-2014 Task 7: Analysis of Clinical Text</article-title>
          <source>Proceedings of the 8th International Workshop on Semantic Evaluation (SemEval 2014)</source>
          <year>2014</year>
          <conf-name>SemEval'14</conf-name>
          <conf-date>August 23-24, 2014</conf-date>
          <conf-loc>Dublin, Ireland</conf-loc>
          <fpage>54</fpage>
          <lpage>62</lpage>
          <pub-id pub-id-type="doi">10.3115/v1/s14-2007</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Guan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Information extraction from electronic medical records using multitask recurrent neural network with contextual word embedding</article-title>
          <source>Appl Sci</source>
          <year>2019</year>
          <month>09</month>
          <day>4</day>
          <volume>9</volume>
          <issue>18</issue>
          <fpage>3658</fpage>
          <pub-id pub-id-type="doi">10.3390/app9183658</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
