<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v10i1e31063</article-id>
      <article-id pub-id-type="pmid">35076407</article-id>
      <article-id pub-id-type="doi">10.2196/31063</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Development of a Pipeline for Adverse Drug Reaction Identification in Clinical Notes: Word Embedding Models and String Matching</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Kuo</surname>
            <given-names>Kuang-Ming</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Goldman</surname>
            <given-names>Jean-Philippe</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Siegersma</surname>
            <given-names>Klaske R</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6377-6867</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Evers</surname>
            <given-names>Maxime</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8988-3190</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Bots</surname>
            <given-names>Sophie H</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4483-5582</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Groepenhoff</surname>
            <given-names>Floor</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1583-701X</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Appelman</surname>
            <given-names>Yolande</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5241-7105</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Hofstra</surname>
            <given-names>Leonard</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4432-4720</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Tulevski</surname>
            <given-names>Igor I</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8507-1531</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Somsen</surname>
            <given-names>G Aernout</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5690-4429</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author">
          <name name-style="western">
            <surname>den Ruijter</surname>
            <given-names>Hester M</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9762-014X</ext-link>
        </contrib>
        <contrib id="contrib10" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Spruit</surname>
            <given-names>Marco</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9237-221X</ext-link>
        </contrib>
        <contrib id="contrib11" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Onland-Moret</surname>
            <given-names>N Charlotte</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff7" ref-type="aff">7</xref>
          <address>
            <institution>Department of Epidemiology</institution>
            <institution>Julius Center for Health Sciences and Primary Care</institution>
            <institution>University Medical Center Utrecht, Utrecht University</institution>
            <addr-line>Universiteitsweg 100</addr-line>
            <addr-line>Utrecht, 3584 CG</addr-line>
            <country>Netherlands</country>
            <phone>31 887569610</phone>
            <email>N.C.Onland@umcutrecht.nl</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2360-913X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Laboratory of Experimental Cardiology</institution>
        <institution>University Medical Center Utrecht</institution>
        <institution>Utrecht University</institution>
        <addr-line>Utrecht</addr-line>
        <country>Netherlands</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Cardiology</institution>
        <institution>Amsterdam University Medical Centers, VU University Medical Center</institution>
        <addr-line>Amsterdam</addr-line>
        <country>Netherlands</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Central Diagnostic Laboratory</institution>
        <institution>University Medical Center Utrecht</institution>
        <institution>Utrecht University</institution>
        <addr-line>Utrecht</addr-line>
        <country>Netherlands</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Cardiology Centers of the Netherlands</institution>
        <addr-line>Utrecht</addr-line>
        <country>Netherlands</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Public Health and Primary Care</institution>
        <institution>Leiden University Medical Center</institution>
        <institution>Leiden University</institution>
        <addr-line>Leiden</addr-line>
        <country>Netherlands</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>Leiden Institute of Advanced Computer Science</institution>
        <institution>Leiden University</institution>
        <addr-line>Leiden</addr-line>
        <country>Netherlands</country>
      </aff>
      <aff id="aff7">
        <label>7</label>
        <institution>Department of Epidemiology</institution>
        <institution>Julius Center for Health Sciences and Primary Care</institution>
        <institution>University Medical Center Utrecht, Utrecht University</institution>
        <addr-line>Utrecht</addr-line>
        <country>Netherlands</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: N Charlotte Onland-Moret <email>N.C.Onland@umcutrecht.nl</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>1</month>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>25</day>
        <month>1</month>
        <year>2022</year>
      </pub-date>
      <volume>10</volume>
      <issue>1</issue>
      <elocation-id>e31063</elocation-id>
      <history>
        <date date-type="received">
          <day>10</day>
          <month>6</month>
          <year>2021</year>
        </date>
        <date date-type="rev-request">
          <day>23</day>
          <month>9</month>
          <year>2021</year>
        </date>
        <date date-type="rev-recd">
          <day>2</day>
          <month>11</month>
          <year>2021</year>
        </date>
        <date date-type="accepted">
          <day>14</day>
          <month>11</month>
          <year>2021</year>
        </date>
      </history>
      <copyright-statement>©Klaske R Siegersma, Maxime Evers, Sophie H Bots, Floor Groepenhoff, Yolande Appelman, Leonard Hofstra, Igor I Tulevski, G Aernout Somsen, Hester M den Ruijter, Marco Spruit, N Charlotte Onland-Moret. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 25.01.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2022/1/e31063" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Knowledge about adverse drug reactions (ADRs) in the population is limited because of underreporting, which hampers surveillance and assessment of drug safety. Therefore, gathering accurate information that can be retrieved from clinical notes about the incidence of ADRs is of great relevance. However, manual labeling of these notes is time-consuming, and automatization can improve the use of free-text clinical notes for the identification of ADRs. Furthermore, tools for language processing in languages other than English are not widely available.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>The aim of this study is to design and evaluate a method for automatic extraction of medication and Adverse Drug Reaction Identification in Clinical Notes (ADRIN).</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Dutch free-text clinical notes (N=277,398) and medication registrations (N=499,435) from the Cardiology Centers of the Netherlands database were used. All clinical notes were used to develop word embedding models. Vector representations of word embedding models and string matching with a medical dictionary (Medical Dictionary for Regulatory Activities [MedDRA]) were used for identification of ADRs and medication in a test set of clinical notes that were manually labeled. Several settings, including search area and punctuation, could be adjusted in the prototype to evaluate the optimal version of the prototype.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The ADRIN method was evaluated using a test set of 988 clinical notes written on the stop date of a drug. Multiple versions of the prototype were evaluated for a variety of tasks. Binary classification of ADR presence achieved the highest accuracy of 0.84. Reduced search area and inclusion of punctuation improved performance, whereas incorporation of the MedDRA did not improve the performance of the pipeline.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The ADRIN method and prototype are effective in recognizing ADRs in Dutch clinical notes from cardiac diagnostic screening centers. Surprisingly, incorporation of the MedDRA did not result in improved identification on top of word embedding models. The implementation of the ADRIN tool may help increase the identification of ADRs, resulting in better care and saving substantial health care costs.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>adverse drug reactions</kwd>
        <kwd>word embeddings</kwd>
        <kwd>clinical notes</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Literature shows that adverse drug events (ADEs) and, more specifically, adverse drug reactions (ADRs) are structurally underreported [<xref ref-type="bibr" rid="ref1">1</xref>]. Clinical trials may underreport or miss ADRs for various reasons, such as a follow-up that is usually too short to catch long-term effects [<xref ref-type="bibr" rid="ref2">2</xref>]. In addition, the study population may be healthier or otherwise different from the target population in regular care [<xref ref-type="bibr" rid="ref3">3</xref>]. As a result, the ADR risk of clinically relevant subgroups such as women and older adults remains unknown [<xref ref-type="bibr" rid="ref4">4</xref>], which places a societal and economic burden on our health care system. The prevalence of hospital admissions associated with ADRs is reported to be as high as 5.3% and estimated to be twice as high in the older adult population [<xref ref-type="bibr" rid="ref5">5</xref>]. In the United States alone, ADRs are estimated to generate US $30 billion in unnecessary costs [<xref ref-type="bibr" rid="ref6">6</xref>]. Efforts have been made to structurally collect information on ADRs both on a national (eg, Lareb in the Netherlands) and international (EudraVigilance [<xref ref-type="bibr" rid="ref7">7</xref>]) level; however, these pharmacovigilance databases do not include relevant patient characteristics and information about prescription rates.</p>
        <p>Regular care data extracted from electronic health records can help in postmarketing surveillance of medication. ADRs are usually not reported in the electronic health record in a structured way, but the clinical notes made during consultations between patients and their physicians may hold relevant information when patients experience an ADR. However, these notes are often stored as free text and thus cannot be easily analyzed [<xref ref-type="bibr" rid="ref8">8</xref>]. Methods that extract ADRs from these free-text fields are needed to access the full potential of these data.</p>
        <p>Natural language processing (NLP) techniques can aid in the differentiation of relevant features from idle free text and prepare free text for research purposes [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. One of the widespread topics in NLP is the use of word embeddings—vector representations of words, often established through evaluation of each word’s context. The use of word embeddings for the evaluation of clinical free text for research purposes is increasing [<xref ref-type="bibr" rid="ref11">11</xref>]. Research has shown that training word embedding models on a domain-specific data set generates better results than training on a general data set [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. As a result, applications of word embedding models are studied in a wide range of topics within the health care domain (eg, evaluation of radiology reports [<xref ref-type="bibr" rid="ref14">14</xref>], identification of ICD-10 codes [<xref ref-type="bibr" rid="ref15">15</xref>], and identification of ADEs in English electronic health records [<xref ref-type="bibr" rid="ref16">16</xref>]) and can potentially be a solution to extract ADRs from Dutch clinical notes.</p>
      </sec>
      <sec>
        <title>Objectives</title>
        <p>The objective of this research is to design a method for the identification of ADRs in clinical notes from a regular care database (Adverse Drug Reaction Identification in Clinical Notes [ADRIN]) using unlabeled data and word embeddings. Although the demonstrations in this study have been done with Dutch clinical notes from the cardiovascular domain, the method has been developed in a way that enables generalization not only to other languages but also to other research questions to mine text in clinical notes.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>The ADRIN method is based on the implementation of a medical taxonomy to enhance standardized terminology (the Medical Dictionary for Regulatory Activities [MedDRA]) [<xref ref-type="bibr" rid="ref17">17</xref>] and on word embeddings trained on a large database of medical free text. In addition, a prototype was developed and evaluated on labeled Dutch clinical notes to determine the performance of this method. <xref rid="figure1" ref-type="fig">Figure 1</xref> shows the general workflow of the ADRIN method.</p>
        <p>This study focused on the identification of ADRs and the corresponding medications. We assumed that patients were compliant with their medication regimen. We defined an ADR as any unwanted event that led to the discontinuation of the prescribed medication. In the following description, clinical notes are defined as the free text written down in the electronic health record by the physician after a patient’s consultation.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Overview of the different steps in the Adverse Drug Reaction Identification in Clinical Notes method. ADR: adverse drug reaction; MedDRA: Medical Dictionary for Regulatory Activities.</p>
          </caption>
          <graphic xlink:href="medinform_v10i1e31063_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Data Set</title>
        <p>The Cardiology Centers of the Netherlands database is a large regular care database from 13 diagnostic cardiac screening centers. In short, this database consists of 109,151 patients who visited one of the outpatient cardiac screening centers between 2007 and 2018 and includes patient characteristics and information about diagnostic tests [<xref ref-type="bibr" rid="ref18">18</xref>].</p>
        <p>In total, there were 277,398 clinical notes in the database and 499,435 medication prescriptions. Clinical notes were deidentified using DEDUCE [<xref ref-type="bibr" rid="ref19">19</xref>]. Medication prescriptions contain information about the prescribed medication, start date and end date (if the medication was discontinued at some point), and reason for discontinuation in free text.</p>
        <p><xref rid="figure2" ref-type="fig">Figure 2</xref> describes the selection of discontinued medication entries from the database. The selected prescriptions were merged with the clinical notes. This resulted in 91,273 discontinued medication entries for which a clinical note was available on the end date of the medication. In cases where multiple prescriptions from the same patient were stopped on the same day (19,992/91,273, 21.9%), the same clinical note was used for all prescriptions. The reason for discontinuation was reported in 40% (36,508/91,273) of the medication prescriptions. From these 91,273 medication entries, we randomly selected 1000 (1.1%) medication entries and corresponding clinical notes as a test set. However, in 1.2% (12/1000) of the cases, the clinical note was empty, resulting in a test set of 988 clinical notes.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Flowchart of selection of clinical notes and corresponding adverse drug reaction and medication. ADR: adverse drug reaction.</p>
          </caption>
          <graphic xlink:href="medinform_v10i1e31063_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>The validation set was obtained from discontinued medication entries and consisted of all medication stops with an ADR reported as a reason for discontinuation and a random selection of 1600 medication stops that were not ADR-related. The latter selection was made because we expected that the clinical notes corresponding to these medication stops might also contain information on possible ADRs. Thus, this selection made it more likely that medication and ADRs would be identified when compared with a random selection of all clinical notes (<xref rid="figure2" ref-type="fig">Figure 2</xref>). These 2 selections of medication stops were merged with the corresponding clinical notes and resulted in a data set of 3000 unique clinical notes as there were some notes linked to medication stops that reported ADRs as well as medication stops that did not report an ADR.</p>
        <p>The Medical Research Ethics Committee of the University Medical Center Utrecht declared that research within the Cardiology Centers of the Netherlands database does not fall under the Dutch Medical Research Involving Human Subjects Act (proposal number 17/359).</p>
      </sec>
      <sec>
        <title>Labeling</title>
        <p>In total, 2 researchers (KRS and ME) independently labeled all clinical notes in the test set. Clinical notes containing ADR information were labeled as positive. When a note was labeled positively, all words in the text describing the medication and ADR combinations were extracted. Discrepancies in labeling between the 2 researchers were discussed, and interobserver variability was evaluated. Furthermore, a validation data set of 3000 unique clinical notes was labeled by one of the researchers (either KRS or ME). These notes were used for identification of thresholds for the word embedding models and for intermediate, qualitative, and direct feedback.</p>
      </sec>
      <sec>
        <title>Preprocessing Clinical Notes</title>
        <p>Before applying word embedding models to the clinical notes, the text underwent multiple preprocessing steps. First, all text was converted to lowercase and unidecoded. Second, the clinical notes were tokenized with a regular expression tokenizer set to greedy tokenization for every word in the presented text. Third, all numerical tokens were converted into their written form (number normalization [<xref ref-type="bibr" rid="ref20">20</xref>]). It is assumed that this results in numbers being more closely related in vector space (ie, <italic>16</italic> and <italic>18</italic> vs <italic>sixteen</italic> and <italic>eighteen</italic>). Doses were removed from the text using regular expressions. The doses were removed to reduce the similarity between frequently prescribed doses and specific medications. This would otherwise contaminate the word embedding models used for identification of medication. Finally, for each token, a check was performed to determine if the token was in the unigram word embedding model. If this was not the case, the word was removed from the list of tokens. An example of a text going through this process is presented in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, Figure S1. The text was preprocessed using Python version 3.7.9 (Python Software Foundation [<xref ref-type="bibr" rid="ref21">21</xref>]) using the nltk package (version 3.5) [<xref ref-type="bibr" rid="ref22">22</xref>].</p>
      </sec>
      <sec>
        <title>Word Embedding Models</title>
        <p>For the automatic identification of ADRs from the text, word embedding models were developed. In total, 2 Word2Vec models imported from the Python Gensim package (version 3.8.0) [<xref ref-type="bibr" rid="ref23">23</xref>] were trained on the complete set of 277,398 clinical notes [<xref ref-type="bibr" rid="ref24">24</xref>]. A unigram model was developed using vectors for single words. This model included all words and derived vectors that occurred more than once in the complete set of clinical notes. The second model used a combination of single words, bigrams, and the derived vectors (bigram model). For the development of this model, words that occurred together &#62;5 times were represented as a vector. Stop words imported from the nltk package [<xref ref-type="bibr" rid="ref22">22</xref>] were removed from the text. A skipgram approach was used.</p>
        <p>The Word2Vec settings were a vector size of 200 dimensions, a window of 5 words around the main word, and 5 iterations of learning. Word embedding models were qualitatively evaluated through inspection of the similarity among words [<xref ref-type="bibr" rid="ref25">25</xref>].</p>
      </sec>
      <sec>
        <title>Identification of Medication and ADRs</title>
        <p>A list of search words was created for both medication and ADRs. The medication search list was based on different groups of cardiovascular medications (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, Table S1). For ADR identification, the most frequently reported ADRs (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, Table S2) in the discontinued medication entries were considered. From these ADRs, a list of search words for ADR recognition was compiled (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, Table S1).</p>
        <p>Word embeddings were used for evaluation of the clinical note. First, the cosine similarity between each word in the clinical note and the search words for medication was calculated. A medication was identified if the cosine similarity was above a predefined threshold (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, Table S1). If no medication was found in the text, a second search was performed to identify a mention of ADRs using more general search words such as <italic>adverse drug reaction</italic>. If these search words were also not identified in the text, the clinical note was automatically labeled as not containing an ADR (<xref rid="figure1" ref-type="fig">Figure 1</xref>, step 1).</p>
        <p>Second, after identification of a medication, the clinical note was searched for ADRs using a predefined search area around the identified medication (<xref rid="figure1" ref-type="fig">Figure 1</xref>, step 2). This search area was restricted to prevent an increasing number of false positives and could be adjusted if it seemed too strict or too wide. This was one of the settings adjusted during the evaluation of the pipeline.</p>
        <p>After this, the area was checked for <italic>non-ADR keywords</italic>. These words occurred immediately before or after the medication and indicated a medication change or extension, such as <italic>increase</italic> and <italic>double</italic>. Therefore, these words did not indicate the presence of an ADR. List comparison was used, in which the tokenized form of the clinical note was compared with a list of words that pointed toward a medication change not likely because of an ADR (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, Table S3).</p>
        <p>The final step in the search for ADRs was the actual identification (<xref rid="figure1" ref-type="fig">Figure 1</xref>, step 3). In total, 2 sequential approaches were developed for this purpose. The first approach included the application of the MedDRA. A selection of the lower-level MedDRA terms (Lowest Level Terms) [<xref ref-type="bibr" rid="ref17">17</xref>] was checked with text retrieval and string matching in the defined search area around the medication. Inclusion or exclusion of the MedDRA was one of the settings adjusted during the evaluation of the pipeline.</p>
        <p>The second approach for identification of ADRs was the use of unigram and bigram word embedding models. For each word in the search area, the cosine similarity with the search words for ADRs was computed (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, Table S1). If this similarity was above the predefined threshold, the word was identified as an ADR. Threshold-setting was performed using a grid search. Visual inspection of the graphical representation of the number of correct matches for a specific word (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, Figure S2) and evaluation of the included words after inspection of the list of most similar words resulted in the setting of the thresholds. For example, in the case of a specific medication, the threshold was set such that spelling mistakes and closely related medications were selected but not words that were related to a substantially different medication group or words that described a disease or condition rather than a medication. For this analysis, the validation data set was used. This is explained in more detail in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
      <sec>
        <title>Pipeline Versions and Tasks</title>
        <p>The pipeline was developed to execute four different tasks: (1) a binary classification of whether the clinical note contained an ADR (<xref rid="figure3" ref-type="fig">Figure 3</xref>A and <xref rid="figure4" ref-type="fig">Figure 4</xref>A), (2) the extraction of the medication that causes an ADR (<xref rid="figure3" ref-type="fig">Figure 3</xref>B and <xref rid="figure4" ref-type="fig">Figure 4</xref>B), (3) the extraction of the ADR individually (<xref rid="figure3" ref-type="fig">Figure 3</xref>C and <xref rid="figure4" ref-type="fig">Figure 4</xref>C), and (4) the exact extraction of the medication and corresponding ADR (<xref rid="figure3" ref-type="fig">Figure 3</xref>D and <xref rid="figure4" ref-type="fig">Figure 4</xref>D).</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Performance of different experimental versions of the pipeline with the inclusion of the MedDRA on the different tasks (A: binary evaluation, B: medication identification, C: ADR identification, D: medication and ADR identification). ADR: adverse drug reaction; MedDRA: Medical Dictionary for Regulatory Activities; NPV: negative predictive value; PPV: positive predictive value.</p>
          </caption>
          <graphic xlink:href="medinform_v10i1e31063_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Performance of different experimental versions of the pipeline without the use of the MedDRA on the different tasks (A: binary evaluation, B: medication identification, C: ADR identification, D: medication and ADR identification). ADR: adverse drug reaction; MedDRA: Medical Dictionary for Regulatory Activities; NPV: negative predictive value; PPV: positive predictive value.</p>
          </caption>
          <graphic xlink:href="medinform_v10i1e31063_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Multiple settings were changed during the analysis to evaluate the performance of the predefined tasks of different experimental designs of the pipeline: inclusion or exclusion of the MedDRA for ADR identification, inclusion or neglect of punctuation for demarcation of the search area, and size of the search area. <xref ref-type="table" rid="table1">Table 1</xref> provides an overview of the different settings evaluated in the versions of the pipeline. Analysis of the pipeline was performed using Python version 3.7.9 [<xref ref-type="bibr" rid="ref21">21</xref>].</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Settings of the pipeline features of the different computational experiments.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="220"/>
            <col width="290"/>
            <col width="290"/>
            <col width="200"/>
            <thead>
              <tr valign="bottom">
                <td>Version</td>
                <td>Words in search area</td>
                <td>Considering punctuation</td>
                <td>Version without MedDRA<sup>a</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1A</td>
                <td>All</td>
                <td>Yes</td>
                <td>1B</td>
              </tr>
              <tr valign="top">
                <td>2A</td>
                <td>All</td>
                <td>No</td>
                <td>2B</td>
              </tr>
              <tr valign="top">
                <td>3A</td>
                <td>10</td>
                <td>Yes</td>
                <td>3B</td>
              </tr>
              <tr valign="top">
                <td>4A</td>
                <td>10</td>
                <td>No</td>
                <td>4B</td>
              </tr>
              <tr valign="top">
                <td>5A</td>
                <td>5</td>
                <td>Yes</td>
                <td>5B</td>
              </tr>
              <tr valign="top">
                <td>6A</td>
                <td>5</td>
                <td>No</td>
                <td>6B</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>MedDRA: Medical Dictionary for Regulatory Activities.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Performance Metrics</title>
        <p>The pipeline was evaluated on the test set of 988 labeled clinical notes. Different metrics were calculated to assess the performance of different versions of the pipeline. The metrics that were calculated included accuracy and balanced accuracy, sensitivity, specificity, precision or positive predictive value, negative predictive value, recall, F<sub>1</sub> score, detection rate, and detection prevalence. An elaborate overview of the performance metrics and the evaluation process can be found in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>, Table S1 and Tables S2-S6, respectively. The outcome was evaluated using the R programming language version 4.0.2 (R Foundation for Statistical Computing [<xref ref-type="bibr" rid="ref26">26</xref>]) and RStudio version 1.3.1093 (RStudio Team [<xref ref-type="bibr" rid="ref27">27</xref>]). The caret package was used for evaluation (version 6.0-86) [<xref ref-type="bibr" rid="ref28">28</xref>].</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Data Set</title>
        <p>The information on the complete data set for word embedding models, validation set, and test set is described in <xref ref-type="table" rid="table2">Table 2</xref>. The characteristics of the included free text are the informal writing style, use of abbreviations, and relatively short text length. <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref> contains 4 different translated examples of clinical notes, as shown in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>, Table S2.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Characteristics of selected clinical notes for development of the word embedding models, validation set, and test set.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="410"/>
            <col width="280"/>
            <col width="210"/>
            <col width="100"/>
            <thead>
              <tr valign="top">
                <td>Variable</td>
                <td>Word embedding models</td>
                <td>Validation set</td>
                <td>Test set</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Language</td>
                <td>Dutch</td>
                <td>Dutch</td>
                <td>Dutch</td>
              </tr>
              <tr valign="top">
                <td>Number of unique records</td>
                <td>277,398</td>
                <td>3000</td>
                <td>988</td>
              </tr>
              <tr valign="top">
                <td>Unique patients</td>
                <td>108,940</td>
                <td>2707</td>
                <td>955</td>
              </tr>
              <tr valign="top">
                <td>Number of unique tokens</td>
                <td>96,086</td>
                <td>9297</td>
                <td>5464</td>
              </tr>
              <tr valign="top">
                <td>Number of tokens per record, mean (SD)</td>
                <td>54 (44)</td>
                <td>53 (44)</td>
                <td>53 (48)</td>
              </tr>
              <tr valign="top">
                <td>Number of tokens per record, median (IQR)</td>
                <td>43 (26-70)</td>
                <td>42 (25-67)</td>
                <td>41 (24-66)</td>
              </tr>
              <tr valign="top">
                <td>Individuals of the female sex, n (%)</td>
                <td>56,527 (51.89)</td>
                <td>1320 (49.07)</td>
                <td>459 (48.06)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Word Embedding Models</title>
        <p>Several search terms of the prototype were independently reviewed in the word embedding models to evaluate the performance of the word embedding models. <xref ref-type="table" rid="table3">Table 3</xref> lists a selection of these keywords and the 5 most similar words. It was noted that, if the search word was a specific group of medications (eg, β-blockers), other groups of medications were also identified (eg, <italic>diltiazem</italic> in the case of the search word <italic>β-blocker</italic>). As the identified word was used for the analysis and not the search word, this had no consequences for the analysis.</p>
        <p>Free text from clinical notes was used in the training of the word embedding models. These are domain-specific data, which can improve the embedding of domain-specific words. An illustrative example is the word embedding of <italic>red</italic>. In our word embedding models trained specifically on medical text, <italic>red</italic> was closely associated with <italic>itching</italic>, <italic>swollen</italic>, <italic>irritated</italic>, and <italic>colourings</italic>, whereas, in word embeddings on general text, <italic>red</italic> would be associated with other colors.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Selection of results from the word embedding models, adverse drug reaction, and medication search words, and a selection of the most relevant similar words where spelling mistakes are excluded. Similarity is based on the cosine similarity.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="240"/>
            <col width="760"/>
            <thead>
              <tr valign="top">
                <td>Keyword</td>
                <td>Most similar words in Dutch (English, cosine similarity)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td><italic>Pijn op de borst</italic> (chest pain)</td>
                <td><italic>Druk op de borst</italic> (chest pressure, 0.80), <italic>kramp op de borst</italic> (chest cramping, 0.70), <italic>pijn in de armen</italic> (pain in the arms, 0.68), and <italic>retrosternale pijn</italic> (retrosternal pain, 0.67)</td>
              </tr>
              <tr valign="top">
                <td><italic>Verminderde conditie</italic> (decreased condition)</td>
                <td><italic>Afname conditie</italic> (decreasing stamina, 0.63), <italic>conditieverlies</italic> (loss of condition, 0.63), <italic>verminderde inspanningstolerantie</italic> (decreased exercise tolerance, 0.62), and <italic>overmatig transpireren</italic> (excessive sweating, 0.62)</td>
              </tr>
              <tr valign="top">
                <td><italic>Oedeem</italic> (edema)</td>
                <td><italic>Perifeer</italic> (peripheral edema, 0.81), <italic>enkeloedeem</italic> (ankle edema, 0.80), <italic>pitting</italic> (pitting edema, 0.80), and <italic>enkels</italic> (ankle edema, 0.75)</td>
              </tr>
              <tr valign="top">
                <td><italic>Hoesten</italic> (coughing)</td>
                <td><italic>Sputum</italic> (sputum, 0.75), <italic>slijm</italic> (mucus, 0.71), <italic>hoestklachten</italic> (coughing complaints, 0.70), and <italic>kuchen</italic> (to cough, 0.70)</td>
              </tr>
              <tr valign="top">
                <td><italic>Duizelig</italic> (dizziness)</td>
                <td><italic>Zweterig</italic> (sweaty, 0.73), <italic>misselijk</italic> (nauseous, 0.71), <italic>zweverig</italic> (floaty, 0.70), and <italic>draaierig</italic> (dizzy, 0.69)</td>
              </tr>
              <tr valign="top">
                <td><italic>Statine</italic> (statin)</td>
                <td><italic>Simvastatine</italic> (simvastatin, 0.80), <italic>pravastatine</italic> (pravastatin, 0.76), <italic>crestor</italic> (rosuvastatin, 0.75), and <italic>atorvastatine</italic> (atorvastatin, 0.74)</td>
              </tr>
              <tr valign="top">
                <td><italic>Betablokker</italic> (β-blocker)</td>
                <td>Metoprolol (0.74), atenolol (0.71), diltiazem (0.66), and bisoprolol (0.65)</td>
              </tr>
              <tr valign="top">
                <td>
                  <italic>Antistolling</italic>
                </td>
                <td><italic>Acenocoumarol</italic> (acenocoumarin, 0.80), <italic>anticoagulantia</italic> (anticoagulants, 0.78), <italic>NOAC</italic> (novel oral anticoagulant, 0.77), and <italic>fenprocoumon</italic> (phenprocoumon, 0.74)</td>
              </tr>
              <tr valign="top">
                <td>
                  <italic>Amlodipine</italic>
                </td>
                <td>Nifedipine (0.85), lisinopril (0.82), barnidipine (0.81), and enalapril (0.79)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Interobserver Variability</title>
        <p>A test set (n=988 clinical notes) was manually labeled by 2 independent researchers (KRS and ME) and used for the evaluation of the pipeline. During this process, 91.9% (908/988) of the clinical notes were identically labeled. This resulted in an interobserver variability of 91% for the binary presence of an ADR. Regarding the literal extraction of the ADR and the medication, there were 21.8% (215/988) of instances where the result differed among the researchers. This was mostly due to a difference in taking adjectives or adverbs into account or a different interpretation of the clinical note. As the pipeline was trained on 1-word and 2-word ADRs, it was decided that these words would not be considered.</p>
        <p>Manual labeling of the 988 clinical notes in the test set resulted in 23.9% (237/988) notes that were binary classified as containing an ADR. In the notes, 286 medication names (task 2) and 364 individual ADRs (task 3) were mentioned. These notes contained a total of 392 combinations of triggered ADRs (task 4) and corresponding medications.</p>
      </sec>
      <sec>
        <title>Evaluation of the Pipeline</title>
        <p><xref rid="figure3" ref-type="fig">Figures 3</xref> and <xref rid="figure4" ref-type="fig">4</xref> show the performance of the pipeline on the different metrics and for the different tasks. <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, Table S4 shows the values for true and false negatives and true and false positives per version and per task. The task for binary classification achieved the highest accuracy, varying from 0.70 to 0.84 (<xref rid="figure3" ref-type="fig">Figure 3</xref>A). However, this was also the easiest task; the accuracy of the pipeline on the exact extraction of medication and ADR together was much lower, varying from 0.23 to 0.64 (<xref rid="figure3" ref-type="fig">Figure 3</xref>D).</p>
        <p>If we look at the specific settings of the different pipelines, the results show that the addition of the MedDRA to the pipeline did not lead to an increase in the performance of the pipeline (<xref rid="figure4" ref-type="fig">Figures 4</xref>A-4D). Overall, the inclusion of punctuation led to a better performance than transcending sentences (versions 1, 3, and 5), and a search area of 5 words seemed to lead to the best results overall (versions 5 and 6).</p>
        <p>The negative predictive value—the chance that no ADR was present when the pipeline did not produce an ADR—was approximately the same per task (0.69-0.91) for all versions of the pipeline. However, the positive predictive value (ie, the chance that, when the pipeline reported an ADR, it was in fact reported in the clinical notes) varied much more per version (<xref rid="figure3" ref-type="fig">Figures 3</xref> and <xref rid="figure4" ref-type="fig">4</xref>) and varied between 0.071 and 0.71. This could be explained by the proportion of false negatives. The proportion of false negatives did not vary much per version of the pipeline for a given task. However, the proportion of false positives had much more variety, caused by a change in the search area and the inclusion or exclusion of punctuation, which led to more ADRs found with a specific medication.</p>
        <p>The optimal version of the pipeline depends on the task for which the pipeline is used. If the task is to select notes based on whether they contain ADRs, the results of the binary classification task (task 1) are most relevant. For this task, version 3B (ie, no MedDRA used, search area of 10 words, and considering punctuation) generated the highest accuracy (0.84) and F<sub>1</sub> score (0.67). In this case, 8.1% (80/988) of notes were classified as false negatives, indicating that 8.1% (80/988) of notes would not be selected when looking for ADRs. The optimal version based on accuracy for identification of medication, ADRs, and ADRs and medication combined was version 5B, with an accuracy for the different tasks of 0.75, 0.72, and 0.64, respectively. Version 3B was the optimal version when emphasis was on the F<sub>1</sub> score, with scores of 0.52, 0.52, and 0.35 for identification of medication, ADRs, and medication and ADRs combined, respectively.</p>
        <p>During the evaluation of the notes in the test set, the prototype incorporating the MedDRA required approximately 70 minutes to generate an outcome for all notes, whereas the versions without the MedDRA took approximately 14 seconds.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>In this study, the ADRIN method and a corresponding prototype were developed. The method was evaluated on a subset of clinical notes. Different versions of the prototype led to differing results on the various tasks. The optimal version of the pipeline depends on the task and the trade-off being made—Is it more valuable to find as many medication and ADR combinations as possible or to find fewer ADRs but also make fewer mistakes? If the goal is the former, a larger search area is better. However, even with the entire note as the search area, at least 8% of all medication and ADR combinations were missed. When one wants to be more accurate, a smaller search area is preferred, and punctuation should be considered. This reduces the number of false positives generated, which results in increased accuracy and F<sub>1</sub> score.</p>
        <p>Surprisingly, the versions incorporating the MedDRA performed worse on most tasks than the same versions without the MedDRA. The negative effect of the MedDRA on the performance was due to the large increase in false positives it generated. This was caused by string matching with the MedDRA, leading to more identifications than the specific set of frequently occurring ADRs defined by the predefined search words. Incorporation of the MedDRA could lead to an improved uptake of rare ADRs, but this was not evaluated in more detail. Furthermore, misspelled ADRs were not recognized by the MedDRA search, creating added value for the incorporation of word embedding models. Moreover, implementation of the MedDRA in the prototype significantly increased execution time, a significant attribute if real-time evaluation of clinical notes is required.</p>
        <p>Illustrative of the underreporting of ADRs is that, in 60% (54,765/91,273) of the discontinued medication entries, no reason was reported for ending the medication in the registration of a patient’s medication. However, 61.5% (36,564/59,426) of clinical notes were matched to these medication entries, which illustrates the potential additional value of clinical notes in unraveling ADRs in this data set.</p>
        <p>When we put these results in light of the ongoing developments of ADR extraction from clinical notes, we see that the performance of our pipeline is similar to that of other presented pipelines. First, most publications have focused on the automatic extraction of ADRs, ADEs, or adverse events [<xref ref-type="bibr" rid="ref29">29</xref>-<xref ref-type="bibr" rid="ref32">32</xref>], whereas our study identified the combination of medication and triggered ADR. Another publication that identified both ADR and medication showed increased performance, with F<sub>1</sub> scores for drug, ADR, and combination of drug and ADR of 0.930, 0.887, and 0.534, respectively [<xref ref-type="bibr" rid="ref33">33</xref>], versus the performance of 0.52, 0.51, and 0.34, respectively, that we showed. When comparing methodologies, our method predominantly relies on internal information and similarity from word embeddings, whereas Tang et al [<xref ref-type="bibr" rid="ref33">33</xref>] use external reference sources for the development of their dictionaries, which is the case in most studies. The use of word embeddings increases the identification of spelling mistakes in medication and ADRs, brand names, and synonyms. However, in our methodology, there was also an increased number of false positives.</p>
        <p>Thus, word embedding models can be used for the identification of spelling mistakes and brand names of medications. However, for the identification of synonyms, the use case must be critically evaluated. It was shown that words that indicated what was done with a specific prescription (eg, <italic>to lower</italic> and <italic>to increase</italic>) were considered similar by the word embedding models. Therefore, it is not suitable to use word embedding models for identification of <italic>non-ADR</italic> keywords, which was solved with string matching in the ADRIN method. The use of domain-specific word embedding models is not new or limited to ADR identification but is increasingly used in the evaluation of clinical notes (eg, in ICD-10 classification [<xref ref-type="bibr" rid="ref15">15</xref>] and anonymization [<xref ref-type="bibr" rid="ref34">34</xref>]).</p>
        <p>Second, publications on identification of ADRs in the English language are numerous, using different methods such as General Architecture for Text Engineering NLP [<xref ref-type="bibr" rid="ref35">35</xref>], trigger words [<xref ref-type="bibr" rid="ref30">30</xref>], or trigger phrases [<xref ref-type="bibr" rid="ref31">31</xref>]. Regarding foreign languages, the field is maturing. Methods developed for the English language can, in some cases, be transferred to other languages. However, the effort that must be put into this depends on the complexity of the task and the level of text interpretation [<xref ref-type="bibr" rid="ref36">36</xref>]. For example, a study of Danish clinical notes obtained better performance (recall of 0.75 in [<xref ref-type="bibr" rid="ref32">32</xref>] vs 0.59 in this study) for sole ADR identification. That study missed approximately one-fourth of all possible ADRs, whereas our optimal performance missed approximately 40%. However, that pipeline included manual dictionary selection and more rule-based filters in the model [<xref ref-type="bibr" rid="ref32">32</xref>].</p>
        <p>We chose to use the presence of a mention of medication in the clinical note as the starting point for identification of an ADR. However, this might result in experienced ADRs being missed. The performance of the pipeline might benefit from the removal of the identification of medication and, for example, coupling with structured medication prescriptions to obtain information about medication use. However, the end user should be aware that this might also increase the number of false positives as the presence of an ADR is no longer limited by the presence of medication.</p>
        <p>Limitations that were identified during the evaluation of the method and prototype are primarily related to missed ADRs from the clinical free text even when the entire clinical note was used for analysis. This problem can be solved by lowering the identifying threshold, but this would also lead to a potentially large increase in false positives. The use of machine and deep learning models can improve the performance of the ADRIN method. However, a large data set of labeled clinical notes is required to train machine and deep learning models, which was unavailable during the development of this model.</p>
        <p>An overall limitation of the prototype is its limited direct translatability to other languages. The word embedding models were specifically trained on Dutch clinical notes. Search terms for word embedding functions must be translated into the new language to implement this method in clinical notes in a different language. Moreover, word embedding models must be trained with notes in the specific language before applying the developed method. Therefore, a large number of clinical free-text notes are required. Because of ethical and privacy constraints, this can be hard to acquire. However, it is technically possible to test and validate the ADRIN method in other languages through translation of search words and negations and after training word embedding models with the specific language.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In conclusion, the ADRIN method and prototype are effective in recognizing ADRs in Dutch clinical notes. Surprisingly, incorporation of the MedDRA did not result in improved identification on top of word embedding models. However, not all versions of the prototype were equally accurate. Different parameter settings can be chosen for the prototype to optimize the task of the model. In a future stage, incorporation of a pipeline in an electronic health record environment can lead to automatic identification and registration of ADRs. This saves the physician’s precious time and decreases the previously mentioned underreporting of ADRs in clinical care, increasing our knowledge about ADRs, which might ultimately benefit the patient.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Supplementary methods: text preprocessing and threshold setting for word embedding models.</p>
        <media xlink:href="medinform_v10i1e31063_app1.docx" xlink:title="DOCX File , 162 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Overview of model settings and results.</p>
        <media xlink:href="medinform_v10i1e31063_app2.docx" xlink:title="DOCX File , 26 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Evaluation of the pipeline.</p>
        <media xlink:href="medinform_v10i1e31063_app3.docx" xlink:title="DOCX File , 19 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ADE</term>
          <def>
            <p>adverse drug event</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">ADR</term>
          <def>
            <p>adverse drug reaction</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">ADRIN</term>
          <def>
            <p>Adverse Drug Reaction Identification in Clinical Notes</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">MedDRA</term>
          <def>
            <p>Medical Dictionary for Regulatory Activities</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This study was funded by the Dutch Heart Foundation (CVON-AI 2018B017).</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>KRS, ME, SHB, HMdR, MS, and NCOM designed the study. KRS and ME designed the analysis plan. ME coded the pipeline, and KRS improved and optimized the pipeline. ME and KRS labeled the clinical notes, and FG provided critical evaluation of the labeling. MS and NCOM supervised the project. All authors critically edited the manuscript, approved the final work, and agree to be accountable for the accuracy and integrity of the work.</p>
      </fn>
      <fn fn-type="conflict">
        <p>LH, GAS, and IIT are employed by the Cardiology Centers of the Netherlands.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hazell</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Shakir</surname>
              <given-names>SA</given-names>
            </name>
          </person-group>
          <article-title>Under-reporting of adverse drug reactions : a systematic review</article-title>
          <source>Drug Saf</source>
          <year>2006</year>
          <volume>29</volume>
          <issue>5</issue>
          <fpage>385</fpage>
          <lpage>96</lpage>
          <pub-id pub-id-type="doi">10.2165/00002018-200629050-00003</pub-id>
          <pub-id pub-id-type="medline">16689555</pub-id>
          <pub-id pub-id-type="pii">2953</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Seruga</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Templeton</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Badillo</surname>
              <given-names>FE</given-names>
            </name>
            <name name-style="western">
              <surname>Ocana</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Amir</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Tannock</surname>
              <given-names>IF</given-names>
            </name>
          </person-group>
          <article-title>Under-reporting of harm in clinical trials</article-title>
          <source>Lancet Oncol</source>
          <year>2016</year>
          <month>05</month>
          <volume>17</volume>
          <issue>5</issue>
          <fpage>209</fpage>
          <lpage>19</lpage>
          <pub-id pub-id-type="doi">10.1016/S1470-2045(16)00152-2</pub-id>
          <pub-id pub-id-type="medline">27301048</pub-id>
          <pub-id pub-id-type="pii">S1470-2045(16)00152-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Leening</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Heeringa</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Deckers</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Franco</surname>
              <given-names>OH</given-names>
            </name>
            <name name-style="western">
              <surname>Hofman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Witteman</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Stricker</surname>
              <given-names>BH</given-names>
            </name>
          </person-group>
          <article-title>Healthy volunteer effect and cardiovascular risk</article-title>
          <source>Epidemiology</source>
          <year>2014</year>
          <month>05</month>
          <volume>25</volume>
          <issue>3</issue>
          <fpage>470</fpage>
          <lpage>1</lpage>
          <pub-id pub-id-type="doi">10.1097/EDE.0000000000000091</pub-id>
          <pub-id pub-id-type="medline">24713887</pub-id>
          <pub-id pub-id-type="pii">00001648-201405000-00023</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>de Vries</surname>
              <given-names>ST</given-names>
            </name>
            <name name-style="western">
              <surname>Denig</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ekhart</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Burgers</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Kleefstra</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Mol</surname>
              <given-names>PG</given-names>
            </name>
            <name name-style="western">
              <surname>van Puijenbroek</surname>
              <given-names>EP</given-names>
            </name>
          </person-group>
          <article-title>Sex differences in adverse drug reactions reported to the National Pharmacovigilance Centre in the Netherlands: An explorative observational study</article-title>
          <source>Br J Clin Pharmacol</source>
          <year>2019</year>
          <month>07</month>
          <volume>85</volume>
          <issue>7</issue>
          <fpage>1507</fpage>
          <lpage>15</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1111/bcp.13923"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/bcp.13923</pub-id>
          <pub-id pub-id-type="medline">30941789</pub-id>
          <pub-id pub-id-type="pmcid">PMC6595313</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kongkaew</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Noyce</surname>
              <given-names>PR</given-names>
            </name>
            <name name-style="western">
              <surname>Ashcroft</surname>
              <given-names>DM</given-names>
            </name>
          </person-group>
          <article-title>Hospital admissions associated with adverse drug reactions: a systematic review of prospective observational studies</article-title>
          <source>Ann Pharmacother</source>
          <year>2008</year>
          <month>07</month>
          <volume>42</volume>
          <issue>7</issue>
          <fpage>1017</fpage>
          <lpage>25</lpage>
          <pub-id pub-id-type="doi">10.1345/aph.1L037</pub-id>
          <pub-id pub-id-type="medline">18594048</pub-id>
          <pub-id pub-id-type="pii">aph.1L037</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sultana</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cutroneo</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Trifirò</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Clinical and economic burden of adverse drug reactions</article-title>
          <source>J Pharmacol Pharmacother</source>
          <year>2013</year>
          <month>12</month>
          <volume>4</volume>
          <issue>Suppl 1</issue>
          <fpage>S73</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jpharmacol.com/article.asp?issn=0976-500X;year=2013;volume=4;issue=5;spage=73;epage=77;aulast=Sultana"/>
          </comment>
          <pub-id pub-id-type="doi">10.4103/0976-500X.120957</pub-id>
          <pub-id pub-id-type="medline">24347988</pub-id>
          <pub-id pub-id-type="pii">JPP-4-73</pub-id>
          <pub-id pub-id-type="pmcid">PMC3853675</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Postigo</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Brosch</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Slattery</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>van Haren</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dogné</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kurz</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Candore</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Domergue</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Arlett</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>EudraVigilance medicines safety database: publicly accessible data for research and public health protection</article-title>
          <source>Drug Saf</source>
          <year>2018</year>
          <month>07</month>
          <volume>41</volume>
          <issue>7</issue>
          <fpage>665</fpage>
          <lpage>75</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/29520645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s40264-018-0647-1</pub-id>
          <pub-id pub-id-type="medline">29520645</pub-id>
          <pub-id pub-id-type="pii">10.1007/s40264-018-0647-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC5990579</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Murdoch</surname>
              <given-names>TB</given-names>
            </name>
            <name name-style="western">
              <surname>Detsky</surname>
              <given-names>AS</given-names>
            </name>
          </person-group>
          <article-title>The inevitable application of big data to health care</article-title>
          <source>J Am Med Assoc</source>
          <year>2013</year>
          <month>04</month>
          <day>03</day>
          <volume>309</volume>
          <issue>13</issue>
          <fpage>1351</fpage>
          <lpage>2</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2013.393</pub-id>
          <pub-id pub-id-type="medline">23549579</pub-id>
          <pub-id pub-id-type="pii">1674245</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sheikhalishahi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Miotto</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Dudley</surname>
              <given-names>JT</given-names>
            </name>
            <name name-style="western">
              <surname>Lavelli</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rinaldi</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Osmani</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Natural language processing of clinical notes on chronic diseases: systematic review</article-title>
          <source>JMIR Med Inform</source>
          <year>2019</year>
          <month>04</month>
          <day>27</day>
          <volume>7</volume>
          <issue>2</issue>
          <fpage>e12239</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2019/2/e12239/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/12239</pub-id>
          <pub-id pub-id-type="medline">31066697</pub-id>
          <pub-id pub-id-type="pii">v7i2e12239</pub-id>
          <pub-id pub-id-type="pmcid">PMC6528438</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Juhn</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence approaches using natural language processing to advance EHR-based clinical research</article-title>
          <source>J Allergy Clin Immunol</source>
          <year>2020</year>
          <month>02</month>
          <volume>145</volume>
          <issue>2</issue>
          <fpage>463</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/31883846"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jaci.2019.12.897</pub-id>
          <pub-id pub-id-type="medline">31883846</pub-id>
          <pub-id pub-id-type="pii">S0091-6749(19)32604-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC7771189</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Khattak</surname>
              <given-names>FK</given-names>
            </name>
            <name name-style="western">
              <surname>Jeblee</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pou-Prom</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Abdalla</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Meaney</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Rudzicz</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>A survey of word embeddings for clinical text</article-title>
          <source>J Biomed Inform</source>
          <year>2019</year>
          <volume>100S</volume>
          <fpage>100057</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2590-177X(19)30056-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.yjbinx.2019.100057</pub-id>
          <pub-id pub-id-type="medline">34384583</pub-id>
          <pub-id pub-id-type="pii">S2590-177X(19)30056-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Masino</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>A framework for developing and evaluating word embeddings of drug-named entity</article-title>
          <source>Proceedings of the BioNLP 2018 workshop</source>
          <year>2018</year>
          <conf-name>BioNLP 2018 workshop</conf-name>
          <conf-date>July 2018</conf-date>
          <conf-loc>Melbourne, Australia</conf-loc>
          <fpage>156</fpage>
          <lpage>60</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/w18-2319</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Afzal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Rastegar-Mojarad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Kingsbury</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>A comparison of word embeddings for the biomedical natural language processing</article-title>
          <source>J Biomed Inform</source>
          <year>2018</year>
          <month>09</month>
          <day>11</day>
          <volume>87</volume>
          <fpage>12</fpage>
          <lpage>20</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2018.09.008</pub-id>
          <pub-id pub-id-type="medline">30217670</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(18)30182-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Banerjee</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Madhavan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Goldman</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rubin</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Intelligent word embeddings of free-text radiology reports</article-title>
          <source>arXiv</source>
          <year>2017</year>
          <access-date>2022-01-04</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1711.06968">https://arxiv.org/abs/1711.06968</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sammani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bagheri</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>van der Heijden</surname>
              <given-names>PG</given-names>
            </name>
            <name name-style="western">
              <surname>Te Riele</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Baas</surname>
              <given-names>AF</given-names>
            </name>
            <name name-style="western">
              <surname>Oosters</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Oberski</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Asselbergs</surname>
              <given-names>FW</given-names>
            </name>
          </person-group>
          <article-title>Automatic multilabel detection of ICD10 codes in Dutch cardiology discharge letters using neural networks</article-title>
          <source>NPJ Digit Med</source>
          <year>2021</year>
          <month>02</month>
          <day>26</day>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>37</fpage>
          <pub-id pub-id-type="doi">10.1038/s41746-021-00404-9</pub-id>
          <pub-id pub-id-type="medline">33637859</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-021-00404-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC7910461</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Adverse drug event and medication extraction in electronic health records via a cascading architecture with different sequence labeling models and word embeddings</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2020</year>
          <month>01</month>
          <day>01</day>
          <volume>27</volume>
          <issue>1</issue>
          <fpage>47</fpage>
          <lpage>55</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/31334805"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocz120</pub-id>
          <pub-id pub-id-type="medline">31334805</pub-id>
          <pub-id pub-id-type="pii">5537181</pub-id>
          <pub-id pub-id-type="pmcid">PMC7489070</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>EG</given-names>
            </name>
            <name name-style="western">
              <surname>Wood</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wood</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>The medical dictionary for regulatory activities (MedDRA)</article-title>
          <source>Drug Saf</source>
          <year>1999</year>
          <month>02</month>
          <volume>20</volume>
          <issue>2</issue>
          <fpage>109</fpage>
          <lpage>17</lpage>
          <pub-id pub-id-type="doi">10.2165/00002018-199920020-00002</pub-id>
          <pub-id pub-id-type="medline">10082069</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bots</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Siegersma</surname>
              <given-names>KR</given-names>
            </name>
            <name name-style="western">
              <surname>Onland-Moret</surname>
              <given-names>NC</given-names>
            </name>
            <name name-style="western">
              <surname>Asselbergs</surname>
              <given-names>FW</given-names>
            </name>
            <name name-style="western">
              <surname>Somsen</surname>
              <given-names>GA</given-names>
            </name>
            <name name-style="western">
              <surname>Tulevski</surname>
              <given-names>II</given-names>
            </name>
            <name name-style="western">
              <surname>den Ruijter</surname>
              <given-names>HM</given-names>
            </name>
            <name name-style="western">
              <surname>Hofstra</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Routine clinical care data from thirteen cardiac outpatient clinics: design of the Cardiology Centers of the Netherlands (CCN) database</article-title>
          <source>BMC Cardiovasc Disord</source>
          <year>2021</year>
          <month>06</month>
          <day>10</day>
          <volume>21</volume>
          <issue>1</issue>
          <fpage>287</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmccardiovascdisord.biomedcentral.com/articles/10.1186/s12872-021-02020-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12872-021-02020-7</pub-id>
          <pub-id pub-id-type="medline">34112101</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12872-021-02020-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC8191101</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Menger</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Scheepers</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>van Wijk</surname>
              <given-names>LM</given-names>
            </name>
            <name name-style="western">
              <surname>Spruit</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>DEDUCE: a pattern matching method for automatic de-identification of Dutch medical text</article-title>
          <source>Telemat Informat</source>
          <year>2018</year>
          <month>07</month>
          <volume>35</volume>
          <issue>4</issue>
          <fpage>727</fpage>
          <lpage>36</lpage>
          <pub-id pub-id-type="doi">10.1016/j.tele.2017.08.002</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sproat</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Black</surname>
              <given-names>AW</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ostendorf</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Richards</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Normalization of non-standard words</article-title>
          <source>Comput Speech Lang</source>
          <year>2001</year>
          <month>07</month>
          <volume>15</volume>
          <issue>3</issue>
          <fpage>287</fpage>
          <lpage>333</lpage>
          <pub-id pub-id-type="doi">10.1006/csla.2001.0169</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="web">
          <source>Python Language Reference, Version 3.7.9</source>
          <access-date>2022-01-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.python.org/">https://www.python.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bird</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>NLTK: The Natural Language Toolkit</article-title>
          <source>Proceedings of the COLING/ACL on Interactive Presentation Sessions</source>
          <year>2006</year>
          <conf-name>COLING/ACL on Interactive Presentation Sessions</conf-name>
          <conf-date>July 17-18, 2006</conf-date>
          <conf-loc>Sydney, Australia</conf-loc>
          <fpage>69</fpage>
          <lpage>72</lpage>
          <pub-id pub-id-type="doi">10.3115/1225403.1225421</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rehurek</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sojka</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Software framework for topic modelling with large corpora</article-title>
          <source>Proceedings of LREC 2010 workshop New Challenges for NLP Frameworks</source>
          <year>2010</year>
          <conf-name>LREC 2010 workshop New Challenges for NLP Frameworks</conf-name>
          <conf-date>2010</conf-date>
          <conf-loc>University of Malta, Valletta, Malta</conf-loc>
          <fpage>46</fpage>
          <lpage>50</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://is.muni.cz/publication/884893/en/Software-Framework-for-Topic-Modelling-with-Large-Corpora/Rehurek-Sojka"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Efficient estimation of word representations in vector space</article-title>
          <source>Proceedings of the 1st International Conference on Learning Representations, ICLR 2013</source>
          <year>2013</year>
          <conf-name>1st International Conference on Learning Representations, ICLR 2013</conf-name>
          <conf-date>May 2-4, 2013</conf-date>
          <conf-loc>Scottsdale, Arizona, USA</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1301.3781"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kuo</surname>
              <given-names>CJ</given-names>
            </name>
          </person-group>
          <article-title>Evaluating word embedding models: methods and experimental results</article-title>
          <source>APSIPA Transactions on Signal and Information Processing</source>
          <year>2019</year>
          <month>07</month>
          <day>08</day>
          <volume>8</volume>
          <fpage>1</fpage>
          <lpage>13</lpage>
          <pub-id pub-id-type="doi">10.1017/atsip.2019.12</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>R Core Team</collab>
          </person-group>
          <article-title>R: A language and environment for statistical computing</article-title>
          <source>R Foundation for Statistical Computing, Vienna, Austria</source>
          <year>2020</year>
          <access-date>2022-01-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.r-project.org/">https://www.r-project.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>RStudio Team</collab>
          </person-group>
          <article-title>RStudio: integrated development environment for R</article-title>
          <source>RStudio, PBC, Boston, MA</source>
          <year>2021</year>
          <access-date>2022-01-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.rstudio.com/">https://www.rstudio.com/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kuhn</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <source>The caret Package</source>
          <year>2009</year>
          <access-date>2022-01-05</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.150.2466&#38;rep=rep1&#38;type=pdf">https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.150.2466&#38;rep=rep1&#38;type=pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Honigman</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rothschild</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Light</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Pulling</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Bates</surname>
              <given-names>DW</given-names>
            </name>
          </person-group>
          <article-title>Using computerized data to identify adverse drug events in outpatients</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2001</year>
          <volume>8</volume>
          <issue>3</issue>
          <fpage>254</fpage>
          <lpage>66</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/11320070"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/jamia.2001.0080254</pub-id>
          <pub-id pub-id-type="medline">11320070</pub-id>
          <pub-id pub-id-type="pmcid">PMC131033</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Murff</surname>
              <given-names>Harvey J</given-names>
            </name>
            <name name-style="western">
              <surname>Forster</surname>
              <given-names>Alan J</given-names>
            </name>
            <name name-style="western">
              <surname>Peterson</surname>
              <given-names>Josh F</given-names>
            </name>
            <name name-style="western">
              <surname>Fiskio</surname>
              <given-names>Julie M</given-names>
            </name>
            <name name-style="western">
              <surname>Heiman</surname>
              <given-names>Heather L</given-names>
            </name>
            <name name-style="western">
              <surname>Bates</surname>
              <given-names>David W</given-names>
            </name>
          </person-group>
          <article-title>Electronically screening discharge summaries for adverse medical events</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2003</year>
          <volume>10</volume>
          <issue>4</issue>
          <fpage>339</fpage>
          <lpage>50</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/12668691"/>
          </comment>
          <pub-id pub-id-type="doi">10.1197/jamia.M1201</pub-id>
          <pub-id pub-id-type="medline">12668691</pub-id>
          <pub-id pub-id-type="pii">M1201</pub-id>
          <pub-id pub-id-type="pmcid">PMC181984</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cantor</surname>
              <given-names>MN</given-names>
            </name>
            <name name-style="western">
              <surname>Feldman</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Triola</surname>
              <given-names>MM</given-names>
            </name>
          </person-group>
          <article-title>Using trigger phrases to detect adverse drug reactions in ambulatory care notes</article-title>
          <source>Qual Saf Health Care</source>
          <year>2007</year>
          <month>04</month>
          <volume>16</volume>
          <issue>2</issue>
          <fpage>132</fpage>
          <lpage>4</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/17403760"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/qshc.2006.020073</pub-id>
          <pub-id pub-id-type="medline">17403760</pub-id>
          <pub-id pub-id-type="pii">16/2/132</pub-id>
          <pub-id pub-id-type="pmcid">PMC2653150</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eriksson</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Jensen</surname>
              <given-names>PB</given-names>
            </name>
            <name name-style="western">
              <surname>Frankild</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jensen</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Brunak</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Dictionary construction and identification of possible adverse drug events in Danish clinical narrative text</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2013</year>
          <volume>20</volume>
          <issue>5</issue>
          <fpage>947</fpage>
          <lpage>53</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://jamia.oxfordjournals.org/cgi/pmidlookup?view=long&#38;pmid=23703825"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2013-001708</pub-id>
          <pub-id pub-id-type="medline">23703825</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2013-001708</pub-id>
          <pub-id pub-id-type="pmcid">PMC3756275</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ang</surname>
              <given-names>PS</given-names>
            </name>
            <name name-style="western">
              <surname>Dorajoo</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Foo</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Soh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Tham</surname>
              <given-names>MY</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Shek</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sung</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Tung</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Detecting adverse drug reactions in discharge summaries of electronic medical records using Readpeer</article-title>
          <source>Int J Med Inform</source>
          <year>2019</year>
          <month>08</month>
          <volume>128</volume>
          <fpage>62</fpage>
          <lpage>70</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2019.04.017</pub-id>
          <pub-id pub-id-type="medline">31160013</pub-id>
          <pub-id pub-id-type="pii">S1386-5056(18)31250-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abdalla</surname>
              <given-names>Mohamed</given-names>
            </name>
            <name name-style="western">
              <surname>Abdalla</surname>
              <given-names>Moustafa</given-names>
            </name>
            <name name-style="western">
              <surname>Rudzicz</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Hirst</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Using word embeddings to improve the privacy of clinical notes</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2020</year>
          <month>06</month>
          <day>01</day>
          <volume>27</volume>
          <issue>6</issue>
          <fpage>901</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/32388549"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocaa038</pub-id>
          <pub-id pub-id-type="medline">32388549</pub-id>
          <pub-id pub-id-type="pii">5835527</pub-id>
          <pub-id pub-id-type="pmcid">PMC7309261</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Iqbal</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Mallah</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Jackson</surname>
              <given-names>RG</given-names>
            </name>
            <name name-style="western">
              <surname>Ball</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ibrahim</surname>
              <given-names>ZM</given-names>
            </name>
            <name name-style="western">
              <surname>Broadbent</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dzahini</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Johnston</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Dobson</surname>
              <given-names>RJ</given-names>
            </name>
          </person-group>
          <article-title>Identification of adverse drug events from free text electronic patient records and information in a large mental health case register</article-title>
          <source>PLoS One</source>
          <year>2015</year>
          <volume>10</volume>
          <issue>8</issue>
          <fpage>e0134208</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0134208"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0134208</pub-id>
          <pub-id pub-id-type="medline">26273830</pub-id>
          <pub-id pub-id-type="pii">PONE-D-14-27426</pub-id>
          <pub-id pub-id-type="pmcid">PMC4537312</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Névéol</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dalianis</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Velupillai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Zweigenbaum</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Clinical natural language processing in languages other than English: opportunities and challenges</article-title>
          <source>J Biomed Semantics</source>
          <year>2018</year>
          <month>03</month>
          <day>30</day>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>12</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://jbiomedsem.biomedcentral.com/articles/10.1186/s13326-018-0179-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13326-018-0179-8</pub-id>
          <pub-id pub-id-type="medline">29602312</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13326-018-0179-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC5877394</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
