<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i3e23983</article-id>
      <article-id pub-id-type="pmid">33783361</article-id>
      <article-id pub-id-type="doi">10.2196/23983</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>A Framework (SOCRATex) for Hierarchical Annotation of Unstructured Electronic Health Records and Integration Into a Standardized Medical Database: Development and Usability Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Chu</surname>
            <given-names>Yuanchia</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Yu</surname>
            <given-names>Yue</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Park</surname>
            <given-names>Jimyung</given-names>
          </name>
          <degrees>BS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2655-5517</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>You</surname>
            <given-names>Seng Chan</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5052-6399</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Jeong</surname>
            <given-names>Eugene</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8240-1198</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Weng</surname>
            <given-names>Chunhua</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9624-0214</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Park</surname>
            <given-names>Dongsu</given-names>
          </name>
          <degrees>BS</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5614-3145</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Roh</surname>
            <given-names>Jin</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3995-355X</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Lee</surname>
            <given-names>Dong Yun</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3678-9862</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Cheong</surname>
            <given-names>Jae Youn</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff7" ref-type="aff">7</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6246-1783</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author">
          <name name-style="western">
            <surname>Choi</surname>
            <given-names>Jin Wook</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff8" ref-type="aff">8</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2396-4705</ext-link>
        </contrib>
        <contrib id="contrib10" contrib-type="author">
          <name name-style="western">
            <surname>Kang</surname>
            <given-names>Mira</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff9" ref-type="aff">9</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7842-0035</ext-link>
        </contrib>
        <contrib id="contrib11" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Park</surname>
            <given-names>Rae Woong</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <address>
            <institution>Department of Biomedical Informatics</institution>
            <institution>Ajou University School of Medicine</institution>
            <addr-line>164, World cup-ro, Yeongtong-gu, Suwon-si, Gyeonggi-do</addr-line>
            <addr-line>Suwon, 16499</addr-line>
            <country>Republic of Korea</country>
            <phone>82 31 219 4471</phone>
            <fax>82 31 219 4472</fax>
            <email>veritas@ajou.ac.kr</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4989-3287</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Biomedical Sciences</institution>
        <institution>Ajou University Graduate School of Medicine</institution>
        <addr-line>Suwon</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Preventive Medicine and Public Health</institution>
        <institution>Yonsei University College of Medicine</institution>
        <addr-line>Seoul</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Biomedical Informatics</institution>
        <institution>Vanderbilt University School of Medicine</institution>
        <addr-line>Nashville, TN</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Biomedical Informatics</institution>
        <institution>Columbia University</institution>
        <addr-line>New York, NY</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Biomedical Informatics</institution>
        <institution>Ajou University School of Medicine</institution>
        <addr-line>Suwon</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>Department of Pathology</institution>
        <institution>Ajou University Hospital</institution>
        <addr-line>Suwon</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff7">
        <label>7</label>
        <institution>Department of Gastroenterology</institution>
        <institution>Ajou University School of Medicine</institution>
        <addr-line>Suwon</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff8">
        <label>8</label>
        <institution>Department of Radiology</institution>
        <institution>Ajou University School of Medicine</institution>
        <addr-line>Suwon</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff9">
        <label>9</label>
        <institution>Department of Digital Health</institution>
        <institution>Samsung Advanced Institute for Health Sciences &#38; Technology</institution>
        <institution>Sungkyunkwan University</institution>
        <addr-line>Seoul</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Rae Woong Park <email>veritas@ajou.ac.kr</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>3</month>
        <year>2021</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>30</day>
        <month>3</month>
        <year>2021</year>
      </pub-date>
      <volume>9</volume>
      <issue>3</issue>
      <elocation-id>e23983</elocation-id>
      <history>
        <date date-type="received">
          <day>31</day>
          <month>8</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>22</day>
          <month>9</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>14</day>
          <month>11</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>23</day>
          <month>1</month>
          <year>2021</year>
        </date>
      </history>
      <copyright-statement>©Jimyung Park, Seng Chan You, Eugene Jeong, Chunhua Weng, Dongsu Park, Jin Roh, Dong Yun Lee, Jae Youn Cheong, Jin Wook Choi, Mira Kang, Rae Woong Park. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 30.03.2021.</copyright-statement>
      <copyright-year>2021</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2021/3/e23983" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Although electronic health records (EHRs) have been widely used in secondary assessments, clinical documents are relatively less utilized owing to the lack of standardized clinical text frameworks across different institutions.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to develop a framework for processing unstructured clinical documents of EHRs and integration with standardized structured data.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We developed a framework known as Staged Optimization of Curation, Regularization, and Annotation of clinical text (SOCRATex). SOCRATex has the following four aspects: (1) extracting clinical notes for the target population and preprocessing the data, (2) defining the annotation schema with a hierarchical structure, (3) performing document-level hierarchical annotation using the annotation schema, and (4) indexing annotations for a search engine system. To test the usability of the proposed framework, proof-of-concept studies were performed on EHRs. We defined three distinctive patient groups and extracted their clinical documents (ie, pathology reports, radiology reports, and admission notes). The documents were annotated and integrated into the Observational Medical Outcomes Partnership (OMOP)-common data model (CDM) database. The annotations were used for creating Cox proportional hazard models with different settings of clinical analyses to measure (1) all-cause mortality, (2) thyroid cancer recurrence, and (3) 30-day hospital readmission.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Overall, 1055 clinical documents of 953 patients were extracted and annotated using the defined annotation schemas. The generated annotations were indexed into an unstructured textual data repository. Using the annotations of pathology reports, we identified that node metastasis and lymphovascular tumor invasion were associated with all-cause mortality among colon and rectum cancer patients (both <italic>P</italic>=.02). The other analyses involving measuring thyroid cancer recurrence using radiology reports and 30-day hospital readmission using admission notes in depressive disorder patients also showed results consistent with previous findings.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>We propose a framework for hierarchical annotation of textual data and integration into a standardized OMOP-CDM medical database. The proof-of-concept studies demonstrated that our framework can effectively process and integrate diverse clinical documents with standardized structured data for clinical research.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>search engine</kwd>
        <kwd>data curation</kwd>
        <kwd>data management</kwd>
        <kwd>common data model</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>With the universal adoption of electronic health records (EHRs), the secondary use of EHRs becomes important for translational research and improvement of the quality of health care [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. EHRs comprise structured (ie, diagnoses, medications, procedures, laboratory tests, and medical device use) and unstructured records, such as clinical notes with diverse formats. Structured data have been widely utilized owing to their processable and standardized codes. In an international open science initiative, Observational Health Data Sciences and Informatics (OHDSI), the structured data of more than 200 hospitals worldwide were mapped into a standardized vocabulary and data structure referred to as the Observational Medical Outcomes Partnership (OMOP)-common data model (CDM) [<xref ref-type="bibr" rid="ref4">4</xref>]. OHDSI is an open collaborative research community, and researchers from each country have collaborated for discovering medical knowledge. OMOP-CDM version 6.0 consists of 15 clinical data tables, four health system data tables, two health economics data tables, three derived tables, and 10 vocabulary tables. All of the tables are represented with standardized medical terminologies. Using the OMOP-CDM, OHDSI has generated medical evidence through large-scale observational research [<xref ref-type="bibr" rid="ref5">5</xref>], which can be achieved by the software and user interface to facilitate standardized phenotyping [<xref ref-type="bibr" rid="ref6">6</xref>], statistical analysis [<xref ref-type="bibr" rid="ref7">7</xref>], and machine-learning application [<xref ref-type="bibr" rid="ref8">8</xref>].</p>
        <p>Clinical notes written in natural language contain invaluable information that is not available in structured data, such as clinicians’ thoughts and medical profiles [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. Although textual data can complement structured data and provide reliable clinical evidence, consistently processing textual data across multiple hospitals has been profoundly restricted. To process unstructured textual data, natural language processing (NLP) technology, an area of computer science for transforming human language into a machine-readable form, is required [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. Clinical documents in the OMOP-CDM have not been actively used for research in OHDSI because of difficulties in consistently processing the textual data and lack of standardized text mining pipelines. Therefore, a standardized clinical text framework for extracting, processing, and annotating unstructured clinical documents is essential to maximize the usefulness of the large body of clinical data in the OMOP-CDM format around the world.</p>
        <p>One of the primary streams of clinical NLP is named entity recognition (NER), which extracts information of interest based on annotation schemas [<xref ref-type="bibr" rid="ref14">14</xref>]. However, most NER studies have used a relatively narrow schema that permits restricted relationships and categories of medical concepts. The restricted medical concepts indicate that only limited information can be extracted from the narratives [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. Conversely, hierarchical annotation leverages a multilevel data structure to extract a wide range of information. Users can richly annotate clinical notes and use the annotations for various purposes. For example, the multilevel structure can contain the hierarchy and relations of the observed tumor, differentiation, gross type, invasion, size, and other characteristics, while the narrow schema cannot include this information. This rich information can be extracted through the hierarchical schema and can be used to answer a variety of research questions. Therefore, a hierarchical annotation schema is more desirable for clinical research [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref20">20</xref>].</p>
      </sec>
      <sec>
        <title>Comparison With Prior Work</title>
        <p>One of the attempts to standardize diverse EHR formats into CDM is the Sentinel project. Sentinel and its component (ie, Mini-Sentinel) have been developed by the United States Food and Drug Administration (FDA), with the aim to create an active surveillance system for monitoring the safety of medical products [<xref ref-type="bibr" rid="ref21">21</xref>]. Sentinel is a US domestic data model, and the OMOP-CDM was used in this study because of its international research network and wide coverage of standardized medical terminology [<xref ref-type="bibr" rid="ref22">22</xref>].</p>
        <p>With regard to NLP frameworks, many NLP information extraction and retrieval systems have been developed to process documents in EHRs for use in clinical practice or research. EMERSE is a clinical note searching system developed using Apache Lucene to increase the availability of EHRs and to help clinicians and researchers effectively retrieve information [<xref ref-type="bibr" rid="ref23">23</xref>]. SemEHR provides a biomedical information extraction and semantic search system for clinical notes, and several case studies have proven the system’s usability [<xref ref-type="bibr" rid="ref24">24</xref>]. SemEHR uses Fast Healthcare Interoperability Resources (FHIR) to represent the clinical semantic concepts extracted from free text. cTAKES and CLAMP are widely used NLP systems that provide serial components for information extraction [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. CREATE is an information retrieval system based on the OMOP-CDM for executing textual cohort selection queries on structured and unstructured data [<xref ref-type="bibr" rid="ref27">27</xref>]. On the other hand, Sharma et al proposed a phenotyping system with NLP algorithms to extract features from the clinical documents of the OMOP-CDM database [<xref ref-type="bibr" rid="ref28">28</xref>].</p>
        <p>Although these systems perform well, they remain difficult to use because they require substantial optimization for the local environment and extensive domain knowledge [<xref ref-type="bibr" rid="ref29">29</xref>]. Moreover, clinical note extraction and preprocessing are needed separately from the systems. The lack of a user interface limits the systems’ usability and portability. Hence, clinical NLP systems that can be applied to standardized medical databases and provide serial NLP components to enhance research continuity are required for users. In this study, we chose the OMOP-CDM owing to its wide coverage of standardized medical terminology and worldwide distributed research networks.</p>
      </sec>
      <sec>
        <title>Objectives</title>
        <p>This study aimed to integrate unstructured clinical textual data with structured data through the framework referred to as Staged Optimization of Curation, Regularization, and Annotation of clinical text (SOCRATex). The proposed framework was designed (1) to define a flexible hierarchical annotation schema containing complex clinical information through efficient chart review, (2) to generate reusable annotations based on user-configurable JavaScript object notation (JSON) architecture, and (3) to construct a clinical text data repository that can be integrated with the standardized structured data.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>System Architecture</title>
        <p>SOCRATex follows a pipeline-based architecture with the following four stages: (1) extracting clinical notes for the target population and preprocessing the data, (2) defining the annotation schema with a hierarchical structure by referring to clustered topics from the clinical notes, (3) performing document-level hierarchical annotation, and (4) constructing a textual data repository with a search engine (<xref rid="figure1" ref-type="fig">Figure 1</xref>). All source codes are available online [<xref ref-type="bibr" rid="ref30">30</xref>].</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>The overall system architecture of Staged Optimization of Curation, Regularization, and Annotation of clinical text (SOCRATex). The system has the following four stages: (1) extracting clinical notes for the target population and preprocessing the data, (2) defining the annotation schema with a hierarchical structure, (3) performing document-level hierarchical annotation using the annotation schema, and (4) indexing annotations for a search engine system. CDM: common data model; EHR: electronic health record; OMOP: Observational Medical Outcomes Partnership.</p>
          </caption>
          <graphic xlink:href="medinform_v9i3e23983_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Stage 1: Data Extraction and Preprocessing</title>
        <p>In the first stage of SOCRATex, the user defines the target population. OHDSI provides an open-source software stack known as ATLAS, which enables users to define complex and transferrable phenotypes of interest based on structured data (ie, diagnosis, medication prescription, medical device use, and laboratory measurements) [<xref ref-type="bibr" rid="ref31">31</xref>]. The documents in the OMOP-CDM are fully connected to other structured data through patient identifiers. Information regarding note type, language, and encoding system is stored with a fully standardized vocabulary [<xref ref-type="bibr" rid="ref32">32</xref>].</p>
        <p>In the NOTE table, foreign keys that can be connected with other tables exist in the CDM (Figures S1 and S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). NOTE_EVENT_ID is a foreign key identifier of the event (ie, drug exposure, visit, and procedure) during which the note was recorded. NOTE_EVENT_FIELD_CONCEPT_ID is a standardized vocabulary showing which NOTE_EVENT_ID is being referred to. NOTE_TYPE_CONCEPT_ID represents the type, origin, or provenance of the recorded clinical notes. SOCRATex extracts a certain type of clinical document for the target population by using NOTE_TYPE_CONCEPT_ID.</p>
        <p>The developed framework provides conventional preprocessing functions, such as eliminating stop words, white spaces, numbers, and punctuations; changing text to lowercase; stemming; and generating a document-term matrix. SOCRATex users can add specific regular expressions or terms to the stop words list.</p>
      </sec>
      <sec>
        <title>Stage 2: Defining the Annotation Schema With a Hierarchical Structure</title>
        <p>To define an annotation schema for organizing hierarchical entities of medical documents, researchers with domain knowledge need to review the overall documents of interest thoroughly. By leveraging latent Dirichlet allocation (LDA), which clusters similar words based on the word distributions over documents, SOCRATex automatically identifies topic clusters among documents of interest and provides samples of each cluster to researchers [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. It is assumed that the sampled documents can represent the semantic characteristics of the extracted documents because the topic clusters represent the latent semantics of the documents. Therefore, reviewing the samples offers a more efficient chart review process than reviewing all of the documents. This reduces redundant labor for reviewing charts of similar content to understand the documents of interest comprehensively.</p>
        <p>To calculate the optimal number of topics in LDA, we used perplexity scores, a statistical measure for probabilistic models. Users can decide the best hyperparameters for LDA performance based on the perplexity scores [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref39">39</xref>]. For the interpretation of LDA results, SOCRATex shows both words and documents from their associated topics (Figures S1 and S2 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). Based on LDA topics, users can define the annotation schema using the JSON architecture, a machine-readable and hierarchical architecture consisting of entity-value pairs.</p>
      </sec>
      <sec>
        <title>Stage 3: Document-Level Annotation With a Defined Schema</title>
        <p>Manual annotation is notorious for being an error-prone process. To limit the errors and ensure annotation quality, we applied the JSON schema that can restrict the values and data types of annotation entities [<xref ref-type="bibr" rid="ref40">40</xref>]. Users need to specify the allowed values of annotation entities using the JSON format. For instance, diameters of observed tumors can be restricted to numeric values. The annotation schema can be distributed to other institutions for generating homogeneous annotations.</p>
      </sec>
      <sec>
        <title>Stage 4: Constructing a Textual Data Repository for Data Exploration and Retrieval</title>
        <p>Elastic Stack, a group of open-source products specialized in textual data exploration and retrieval, is used for constructing a textual data repository for the annotations. Elastic Stack is composed of Elasticsearch and Kibana. Elasticsearch is a full-text search and analytics engine for textual data, and Kibana is its visualization dashboard [<xref ref-type="bibr" rid="ref41">41</xref>]. SOCRATex can index the generated annotations into Elasticsearch, and users can explore their data using Kibana (Figure S1 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>).</p>
      </sec>
      <sec>
        <title>Validation Using EHRs</title>
        <p>We applied SOCRATex against hospital data to validate the usability of the framework. The following three distinctive groups were defined using the OMOP-CDM database of Ajou University School of Medicine [<xref ref-type="bibr" rid="ref42">42</xref>]: (1) patients who were diagnosed with malignant neoplasms of the colon and rectum between 2014 and 2017, (2) patients who were diagnosed with malignant neoplasms of the thyroid gland and who underwent thyroidectomy between 2014 and 2016, and (3) patients who were diagnosed with major depressive disorder and hospitalized via the emergency department between 2012 and 2018.</p>
        <p>From each group of patients, we extracted a specific type of clinical note. Among the patients with colorectal cancer, we extracted their pathology reports with the statement of cancerous lesions of the colon and rectum. Radiology reports of postoperative thyroid ultrasonography were extracted for the patients who underwent thyroidectomy owing to thyroid cancer. Among the patients with major depressive disorder, admission notes were selected and identified with a description of the reason for hospitalization.</p>
        <p>Each note type was selected because of its different characteristics (<xref rid="figure2" ref-type="fig">Figure 2</xref>). Pathology reports have a semistructured format that is similar to the synoptic pathology reporting form and are primarily written in English [<xref ref-type="bibr" rid="ref43">43</xref>]. Radiology reports feature a semistructured data format and narrative sentences. Admission notes have narrative descriptions of medical history, disease diagnosis, and medication prescription of the patients. Korean characters were removed and only English characters were included for topic modeling analysis. During the annotation process, we used both languages for accurate annotation. To evaluate the accuracy and efficiency of the SOCRATex annotation process, we compared the annotation process of our system and traditional manual chart review.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Examples of annotating certain types of clinical documents and their annotation process. Pathology reports have a semistructured format, and radiology reports have a semistructured format with narrative sentences. Admission notes have narrative descriptions in both Korean and English.</p>
          </caption>
          <graphic xlink:href="medinform_v9i3e23983_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Both structured and unstructured textual data were deidentified to protect patient data. The OMOP-CDM per se is a pseudonymized data model that does not allow identifying specific individuals with the data. Hence, it is compliant with pseudonymization of the EU General Data Protection Regulation and Health Insurance Portability and Accountability Act of 1996 (HIPAA) regulations [<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref45">45</xref>]. Moreover, the deidentification process in Ajou University Hospital was applied to the data sets to ensure privacy protection (Figure S1 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). With the process, patient IDs are encrypted and only the researcher with IRB approval is allowed to receive decryption keys. However, the unstructured textual data can still contain private information. Therefore, a rule-based algorithm was applied to eliminate HIPAA-defined protected health information (PHI) and Korean PHI from the narratives (Tables S1 and S2 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). We applied the algorithm by Shin et al that was developed on bilingual clinical documents (ie, Korean and English), was validated on 5000 notes of 33 types, and showed 99.87% precision [<xref ref-type="bibr" rid="ref46">46</xref>]. The rules for the data set of this study were then optimized and updated.</p>
        <p>As proof-of-concept studies, we performed survival analyses to measure mortality rates, cancer recurrence, and hospital readmission using information from both structured clinical data and medical narratives. All-cause mortality, thyroid cancer diagnoses, and hospital readmission information were extracted from structured coded data and defined as outcomes of the analyses. From the annotations, we extracted the following clinical features that were not in structured data: node metastasis, lymphovascular tumor invasion, echogenicity of thyroid nodules, and episodes and specifiers of major depressive disorder. The episodes and specifiers were measured using the Diagnostic and Statistical Manual of Mental Disorders, Fifth Edition (DSM-5) [<xref ref-type="bibr" rid="ref47">47</xref>]. Furthermore, we calculated the Korean Thyroid Imaging Reporting and Data System (K-TIRADS) score, a risk stratification measure for thyroid nodules, using the extracted covariates (ie, size, content, and echogenicity of thyroid nodules) [<xref ref-type="bibr" rid="ref48">48</xref>]. A high K-TIRADS score indicates that the observed thyroid lesions are suspected to be malignant.</p>
        <p>In patients diagnosed with colon and rectum cancer, we measured all-cause mortality stratified by node metastasis and lymphovascular invasion. Thyroid cancer recurrence in patients who underwent thyroidectomy was measured with the K-TIRADS score and echogenicity on ultrasonography. Among the patients with major depressive disorder, hospital readmission was measured with specifiers and episodes of major depressive disorder. The <italic>P</italic> value of the log-rank test with Kaplan-Meier curves was measured on each annotation body. We used Cox proportional hazard models to assess and calculate the hazard ratio (HR) between the defined groups. HRs are presented with 95% CIs and <italic>P</italic> values. All <italic>P</italic> values &#60;.05 were considered statistically significant.</p>
        <p>To demonstrate external feasibility, we applied SOCRATex to pathology reports from another tertiary hospital’s OMOP-CDM database. This study was approved by the Institutional Review Board at Ajou University Hospital (IRB approval number: AJIRB-MED-MDB-19-579).</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Stage 1: Defining Patient Groups and Extracting Clinical Documents</title>
        <p>Overall, 600 pathology reports from 588 patients with colon and rectum cancer, 308 radiology reports from 220 patients who underwent thyroidectomy, and 147 admission notes from 145 patients with major depressive disorder were included in the study. The characteristics of the patients are shown in <xref ref-type="table" rid="table1">Table 1</xref>. To compare the cohorts, medical history of the patients was extracted using structured coded data.</p>
        <p>Moreover, the information loss and accuracy of clinical note extraction were investigated (Tables S1, S2, and S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The results showed that data sparsity dropped by less than 1% in pathology and radiology reports and by 4% in admission notes despite the removal of non-English characters. The most frequent tokens in documents usually consisted of English characters and a few Korean characters, such as “환자는 (the patient),” “하였다 (did),” and “정신과 (department of psychiatry).”</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Baseline characteristics of the patient groups.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="300"/>
            <col width="0"/>
            <col width="190"/>
            <col width="0"/>
            <col width="180"/>
            <col width="0"/>
            <col width="180"/>
            <col width="0"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Characteristic</td>
                <td colspan="2">Patients with pathology reports<break/>(n=588)</td>
                <td colspan="2">Patients with radiology reports<break/>(n=220)</td>
                <td colspan="2">Patients with psychiatric admission notes<break/>(n=145)</td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="3">Age (years), mean (SD)</td>
                <td colspan="2">62.65 (12.58)</td>
                <td colspan="2">46.52 (18.69)</td>
                <td colspan="2">49.12 (19.59)</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td colspan="3">Female, n (%)</td>
                <td colspan="2">229 (38.9)</td>
                <td colspan="2">176 (80.0)</td>
                <td colspan="2">107 (73.8)</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>General medical history, n (%)</bold>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Dementia</td>
                <td colspan="2">6 (1.1)</td>
                <td colspan="2">0 (0.0)</td>
                <td colspan="2">0 (0.0)</td>
                <td colspan="2">.23</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Gastroesophageal reflux disease</td>
                <td colspan="2">9 (1.5)</td>
                <td colspan="2">8 (3.6)</td>
                <td colspan="2">0 (0.0)</td>
                <td colspan="2">.03</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Gastrointestinal hemorrhage</td>
                <td colspan="2">31 (5.3)</td>
                <td colspan="2">1 (0.5)</td>
                <td colspan="2">0 (0.0)</td>
                <td colspan="2">&#60;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Hyperlipidemia</td>
                <td colspan="2">9 (1.5)</td>
                <td colspan="2">11 (5.0)</td>
                <td colspan="2">3 (2.1)</td>
                <td colspan="2">&#60;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Hypertensive disorder</td>
                <td colspan="2">165 (28.1)</td>
                <td colspan="2">15 (6.8)</td>
                <td colspan="2">2 (1.4)</td>
                <td colspan="2">&#60;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Diabetes mellitus</td>
                <td colspan="2">84 (14.3)</td>
                <td colspan="2">18 (8.2)</td>
                <td colspan="2">0 (0.0)</td>
                <td colspan="2">&#60;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Renal impairment</td>
                <td colspan="2">22 (3.7)</td>
                <td colspan="2">3 (1.4)</td>
                <td colspan="2">0 (0.0)</td>
                <td colspan="2">.01</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Liver lesion</td>
                <td colspan="2">30 (5.1)</td>
                <td colspan="2">1 (0.5)</td>
                <td colspan="2">0 (0.0)</td>
                <td colspan="2">&#60;.001</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>Cardiovascular disease, n (%)</bold>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Atrial fibrillation</td>
                <td colspan="2">11 (1.9)</td>
                <td colspan="2">0 (0.0)</td>
                <td colspan="2">1 (0.7)</td>
                <td colspan="2">.08</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cerebrovascular disease</td>
                <td colspan="2">6 (1.0)</td>
                <td colspan="2">1 (0.5)</td>
                <td colspan="2">0 (0.0)</td>
                <td colspan="2">.64</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Coronary arteriosclerosis</td>
                <td colspan="2">10 (1.7)</td>
                <td colspan="2">3 (1.4)</td>
                <td colspan="2">0 (0.0)</td>
                <td colspan="2">.34</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Heart disease</td>
                <td colspan="2">39 (6.6)</td>
                <td colspan="2">8 (3.6)</td>
                <td colspan="2">1 (0.7)</td>
                <td colspan="2">.008</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Heart failure</td>
                <td colspan="2">7 (1.2)</td>
                <td colspan="2">2 (0.9)</td>
                <td colspan="2">0 (0.0)</td>
                <td colspan="2">.45</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Ischemic heart disease</td>
                <td colspan="2">16 (2.7)</td>
                <td colspan="2">2 (0.9)</td>
                <td colspan="2">0 (0.0)</td>
                <td colspan="2">.048</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Peripheral vascular disease</td>
                <td colspan="2">10 (1.7)</td>
                <td colspan="2">3 (1.4)</td>
                <td colspan="2">1 (0.7)</td>
                <td colspan="2">.86</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Stage 2: Defining the Annotation Schema With a Hierarchical Structure</title>
        <p>The optimal number of topics for pathology reports was determined to be 5, whereas the optimal number of topics for both radiology reports and admission notes was 4 (Table S1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p>
        <p>We defined a hierarchical schema of pathology reports based on the topics and sample documents (<xref rid="figure3" ref-type="fig">Figure 3</xref>). The entities of pathology reports were classified into the following three groups: lesions, lymph nodes, and biomarker tests. Each entity has a multilevel structure, especially the invasion entity, which showed a deep multilevel structure containing the hierarchical information of lymphatic, vascular, and perineural invasion, and resection margin. The annotation schemas of radiology reports and admission notes are shown in Figures S3 and S4 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. Overall, 23 entities were defined for pathology reports, 20 entities for radiology reports, and 5 entities for admission notes.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Defining a hierarchical annotation schema of pathology reports, which describes lesions of colon and rectum cancer. The process had the following three steps: (1) classifying documents using clustered topics from the latent Dirichlet allocation model, (2) identifying medical entities of interest, and (3) designing the annotation schema. PCR: polymerase chain reaction; PNA: peptide nucleic acid.</p>
          </caption>
          <graphic xlink:href="medinform_v9i3e23983_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Stage 3: Document-Level Annotation With a Defined Schema</title>
        <p>Document-level annotation was applied on the extracted documents, resulting in the annotation of 1055 clinical documents with the defined schema. A total of 1000 colonoscopy pathology reports from another tertiary hospital database were annotated with the distributed annotation schema (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). The comparison between SOCRATex annotation and traditional chart review is described in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>. It shows that the mean accuracy of traditional chart review was 0.917 and its mean annotation time was 548 minutes. On the other hand, the mean accuracy of SOCRATex annotation was 0.937 and its mean annotation time was 360 minutes.</p>
      </sec>
      <sec>
        <title>Stage 4: Constructing a Textual Data Repository for Data Exploration and Retrieval</title>
        <p>The generated annotations were indexed into Elasticsearch to construct a textual data repository. Table S1 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref> demonstrates that the admission notes had the largest number of tokens (24,319 tokens), whereas the radiology reports had 1006 tokens and the pathology reports had 3561 tokens.</p>
        <p>Using the constructed textual data repository, we explored the entity distributions of the annotations using the Kibana interface (<xref rid="figure4" ref-type="fig">Figure 4</xref>). <xref rid="figure4" ref-type="fig">Figure 4</xref>A shows the distributions of pathology entities. It shows that adenocarcinoma was the most frequent tumor, which was observed in 412 of 600 documents (68.7%). Tubular and tubulovillous adenomas were the second most frequent tumors, which were observed in 186 (31.0%) and 48 (8.0%) documents, respectively. Among the biomarker tests, the microsatellite instability test was identified as the most frequent biomarker test with 90 (50.3%) occurrences, followed by epidermal growth factor receptor with 85 (47.5%) occurrences. The distributions of radiology entities showed that solid or predominantly solid thyroid nodules were observed in 34 of 148 documents, in which 209 (16.2%) nodules were observed via thyroid ultrasonography (<xref rid="figure4" ref-type="fig">Figure 4</xref>B). There were only 4 (2.70%) documents describing cystic or predominantly cystic nodules. Of 144 observed lesions with nodule size, 20 (14.1%) nodules were larger than 2.0 cm and the other 122 (85.9%) nodules were less than 2.0 cm. Using the DSM-5, we identified the severity, episode, and specifier of major depressive disorder from admission notes (<xref rid="figure4" ref-type="fig">Figure 4</xref>C). As a result, 52 (35.4%) hospitalized cases and 33 (22.5%) cases were identified as having anxious distress of major depressive disorder and psychotic or mood-congruent psychotic features, respectively. In addition, we identified the medication usage patterns of patients. The most frequently prescribed medication was alprazolam with 66 (22.6%) prescriptions, followed by escitalopram with 35 (11.3%) prescriptions.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Histograms of annotation entities derived from pathology reports (A), radiology reports (B), and admission notes (C). (A) shows the number of observed histologies, differentiations, procedures, and biomarkers; (B) shows the number of locations, impressions, contents, and diameters of the observed thyroid nodules; and (C) shows the specifiers, episodes, severities, and used medications in major depressive disorder patients.</p>
          </caption>
          <graphic xlink:href="medinform_v9i3e23983_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Hierarchical annotations can show further relationships between the entities. <xref rid="figure5" ref-type="fig">Figure 5</xref> describes the hierarchical relations of the entities. Using Kibana queries, we classified each annotation body into two categories. First, the observed tumors and the differentiation from pathology report findings are described (<xref rid="figure5" ref-type="fig">Figure 5</xref>A and 5B). Both are distinguished by lymph node positivity. Among moderately differentiated adenocarcinomas, 29 (71.6%) lymph node–positive cases and 45 (66.6%) lymph node–negative cases were observed. The next most frequent differentiation of adenocarcinoma differed according to lymph node involvement as follows: 6 (14.8%) poorly differentiated in lymph node–positive cases and 11 (17.4%) well differentiated in lymph node–negative cases. Second, relations of thyroid nodule types and contents by anatomic locations are described in <xref rid="figure5" ref-type="fig">Figure 5</xref>C and 5D. The results show that solid nodules with malignancy were observed in 7 (16.3%) cases in the left thyroid and 10 (14.1%) cases in the right thyroid. On the contrary, benign cystic thyroid nodules were observed in only 2 (2.6%) cases in the left thyroid and 2 (1.7%) cases in the right thyroid. Third, the specifiers and severities of major depressive disorder were identified (<xref rid="figure5" ref-type="fig">Figure 5</xref>E and 5F). The results were divided according to single or recurrent episodes of major depressive disorder. Among the patients with single episodes of the disease, 21 (45.7%) cases were identified as involving severe major depressive disorder with an anxious distress specifier. Additionally, 20 (51.3%) cases of multiple episodes were identified as involving an anxious distress specifier with severe symptoms.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Sunburst plots generated using the Kibana interface. (A) and (B) show the observed histologies and their differentiation from pathology reports. (A) shows the results of lymph node–positive cases, and (B) shows the results of lymph node–negative cases. (C) and (D) are observed from radiology reports. Each of the plots indicates the left and right thyroid in order. (E) and (F) show the disease specifier and its severity from the admission notes. (E) shows the results of single-episode patients, and (F) shows the results of multiple-episode patients.</p>
          </caption>
          <graphic xlink:href="medinform_v9i3e23983_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>The annotation results of the other tertiary hospital database are described in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>. Tubular adenoma observed at the sigmoid colon was the most frequent histology with 131 cases among 720 observed lesions (20.1%), and hyperplastic polyps represented the second most frequent histology in the sigmoid colon with 76 (11.7%) cases.</p>
      </sec>
      <sec>
        <title>Association of Features From Clinical Notes and Structured Data</title>
        <p>For patients diagnosed with malignant neoplasm of the colon and rectum, 5-year survival analyses were performed (<xref rid="figure6" ref-type="fig">Figure 6</xref>A and 6B). The analyses measured mortality rate according to node metastasis and lymphovascular tumor invasion. We found that patients with lymph node involvement had significantly worse survival rates than those without involvement (HR 5.22, 95% CI 1.08-25.22; <italic>P</italic>=.04). Lymphovascular invasion was also associated with significantly higher mortality in patients with colorectal cancer (HR 3.75, 95% CI 1.14-12.32; <italic>P</italic>=.03).</p>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Kaplan-Meier curves with <italic>P</italic> values of the log-rank test. Survival analyses were performed. (A) and (B) measure 5-year mortality rates of patients with colorectal cancer by node metastasis and lymphovascular tumor invasion, respectively. (C) and (D) measure thyroid cancer recurrence by echogenicity of thyroid nodules and K-TIRADS scores, respectively. (E) and (F) measure 30-day readmission of patients with major depressive disorder by disease specifiers and episodes, respectively. K-TIRADS: Korean Thyroid Imaging Reporting and Data System.</p>
          </caption>
          <graphic xlink:href="medinform_v9i3e23983_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Recurrence risk of thyroid cancer stratified by the echogenicity of thyroid nodules and the K-TIRADS score was measured (<xref rid="figure6" ref-type="fig">Figure 6</xref>C and 6D). In our analysis, recurrence of thyroid cancer was not significantly associated with the echogenicity of thyroid nodules (HR 0.80, 95% CI 0.16-3.98; <italic>P</italic>=.78). On the other hand, we found that high K-TIRADS scores (K-TIRADS 4 and 5) were associated with a higher risk of thyroid cancer recurrence compared with low K-TIRADS scores (K-TIRADS 1-3) (HR 12.43, 95% CI 2.73-56.60; <italic>P</italic>&#60;.001).</p>
        <p>Among patients with major depressive disorder, we measured 30-day readmission according to disease specifiers and episodes, which were measured based on the DSM-5 (<xref rid="figure6" ref-type="fig">Figure 6</xref>E and 6F). The specifiers were classified into anxious distress and psychotic features. Disease episodes were classified into single or recurrent episodes. The results showed that 30-day readmission was not significantly associated with the specifiers of major depressive disorder (HR 1.07, 95% CI 0.50-2.26; <italic>P</italic>=.87). Single or recurrent episodes of major depressive disorder were not significantly associated with 30-day readmission (HR 0.78, 95% CI 0.47-1.29; <italic>P</italic>=.34).</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>The framework succeeded in hierarchically annotating unstructured clinical documents and integrating them into standardized structured data. Through proof-of-concept studies, three different types of clinical documents (ie, pathology reports, radiology reports, and admission notes) were extracted and processed with topic modeling to identify medical concepts. The hierarchical schemas were defined with efficient chart review by sampling documents according to semantic topics. Overall, 1055 documents were manually annotated using the schemas and indexed in the search engine. We attempted multidimensional validation by identifying the characteristics of the hierarchical annotations and by performing survival analyses with integrated data of structured and unstructured textual information. The following were identified through validation: (1) the association of node positivity with mortality in patients with colorectal cancer, (2) the association of the K-TIRADS score with thyroid cancer recurrence, and (3) medication usage patterns according to depression episodes.</p>
        <p>SOCRATex uses flexible annotation schemas for clinical text annotation that can include complex information in free-text documents (<xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>). The narrow annotation schema can only extract the entities of disease, treatment, and test [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. These simple entities are easy to annotate and effective for training models, but they make it difficult to explain the relationships among entities. On the contrary, the annotation schema on pathology reports successfully contained the relationships among tumor type, dimension, location, and invasion. Consequently, we identified that more than 42% (253/588) of colorectal cancer patients had moderately differentiated adenocarcinoma and underwent a microsatellite instability test. In the radiology reports, 23% (34/148) of thyroid nodules were identified as having solid content. The hierarchical schema of admission notes identified medication usage patterns by disease episodes, showing that alprazolam and escitalopram were the most frequently prescribed medications in both patient groups.</p>
        <p>Through proof-of-concept studies, we demonstrated that the generated hierarchical annotations could be used in various settings of clinical research. The survival analyses of patients with colorectal cancer showed that node positivity and lymphovascular invasion were significantly associated with a higher mortality rate, which is consistent with the findings of previous studies [<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref50">50</xref>]. The analyses of radiology reports found that higher K-TIRADS scores were significantly associated with the recurrence of thyroid cancer, which is consistent with previous reports [<xref ref-type="bibr" rid="ref48">48</xref>].</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This study has several limitations that can direct future research. First, interesting clinical implications were not determined from our proof-of-concept studies. To discover novel medical evidence, a sophisticated study design is required. However, our aim here was to demonstrate that the generated textual data repository could be used for clinical research. Second, the feasibility of the framework in the distributed research network was not fully validated. Still, we distributed the annotation schema of pathology reports to the other institution and were able to annotate 1000 colonoscopy pathology reports. Third, the defined annotation schema was not systematically evaluated. Three annotation schemas were defined with domain experts according to their related clinical domains. However, systematic validation of the schemas is still required. Moreover, the applicability of FHIR standards in the system of this study will be investigated to test its extensibility.</p>
        <p>Although the generated annotations can be reused for clinical analyses of various purposes, the initial manual annotation of documents is still a time-consuming and costly process. In future work, state-of-the-art algorithms, such as BERT, XLNet, and GPT-3, could be applied to automatic information extraction processes to reduce the annotation burden and cost [<xref ref-type="bibr" rid="ref51">51</xref>-<xref ref-type="bibr" rid="ref53">53</xref>].</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>We propose a clinical text processing framework to generate flexible hierarchical annotations and integrate them with the standardized structured data of the OMOP-CDM. The proof-of-concept studies demonstrated that the generated annotations were integrated with the structured data and were successfully used for various clinical research approaches with efficient chart review processes. The conformance with CDM allows the application of a standard annotation schema to generate homogeneous annotations from different institutions.</p>
        <p/>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Clinical note data extraction, processing, and validation.</p>
        <media xlink:href="medinform_v9i3e23983_app1.docx" xlink:title="DOCX File , 745 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Evaluating the latent Dirichlet allocation model performance and defining annotation schemas.</p>
        <media xlink:href="medinform_v9i3e23983_app2.docx" xlink:title="DOCX File , 1067 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Staged Optimization of Curation, Regularization, and Annotation of clinical text (SOCRATex) annotation and information retrieval system.</p>
        <media xlink:href="medinform_v9i3e23983_app3.docx" xlink:title="DOCX File , 519 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Protecting and deidentifying patient information.</p>
        <media xlink:href="medinform_v9i3e23983_app4.docx" xlink:title="DOCX File , 434 KB"/>
      </supplementary-material>
      <supplementary-material id="app5">
        <label>Multimedia Appendix 5</label>
        <p>Study results from Samsung Medical Center.</p>
        <media xlink:href="medinform_v9i3e23983_app5.docx" xlink:title="DOCX File , 239 KB"/>
      </supplementary-material>
      <supplementary-material id="app6">
        <label>Multimedia Appendix 6</label>
        <p>Comparison between Staged Optimization of Curation, Regularization, and Annotation of clinical text (SOCRATex) annotation and traditional chart review.</p>
        <media xlink:href="medinform_v9i3e23983_app6.docx" xlink:title="DOCX File , 14 KB"/>
      </supplementary-material>
      <supplementary-material id="app7">
        <label>Multimedia Appendix 7</label>
        <p>Comparison between Staged Optimization of Curation, Regularization, and Annotation of clinical text (SOCRATex) and other natural language processing systems.</p>
        <media xlink:href="medinform_v9i3e23983_app7.docx" xlink:title="DOCX File , 16 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">CDM</term>
          <def>
            <p>common data model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">DSM-5</term>
          <def>
            <p>Diagnostic and Statistical Manual of Mental Disorders, Fifth Edition</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">FHIR</term>
          <def>
            <p>Fast Healthcare Interoperability Resources</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">HIPAA</term>
          <def>
            <p>Health Insurance Portability and Accountability Act</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">HR</term>
          <def>
            <p>hazard ratio</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">JSON</term>
          <def>
            <p>JavaScript Object Notation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">K-TIRADS</term>
          <def>
            <p>Korean Thyroid Imaging Reporting and Data System</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">LDA</term>
          <def>
            <p>latent Dirichlet allocation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">NER</term>
          <def>
            <p>named entity recognition</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">OHDSI</term>
          <def>
            <p>Observational Health Data Sciences and Informatics</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">OMOP</term>
          <def>
            <p>Observational Medical Outcomes Partnership</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">PHI</term>
          <def>
            <p>protected health information</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">SOCRATex</term>
          <def>
            <p>Staged Optimization of Curation, Regularization, and Annotation of clinical text</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by the Bio Industrial Strategic Technology Development Program (20001234, 20003883) funded by the Ministry of Trade, Industry &amp; Energy (MOTIE, Korea) and a grant from the Korea Health Technology R&amp;D Project through the Korea Health Industry Development Institute (KHIDI) funded by the Ministry of Health &amp; Welfare, Republic of Korea (grant numbers: HI16C0992, HI19C0872).</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>SCY, JP, and RWP contributed to the study design. JR, DYL, JYC, JWC, MK, and RWP obtained the relevant data used for the study. JP and DP contributed to the development and evaluation of SOCRATex. JP, SCY, EJ, CW, and RWP contributed to writing and revising the paper. All authors contributed to the writing and final approval of this manuscript. JP and SCY contributed equally to this work.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Blumenthal</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Launching HITECH</article-title>
          <source>N Engl J Med</source>
          <year>2010</year>
          <month>02</month>
          <day>04</day>
          <volume>362</volume>
          <issue>5</issue>
          <fpage>382</fpage>
          <lpage>5</lpage>
          <pub-id pub-id-type="doi">10.1056/NEJMp0912825</pub-id>
          <pub-id pub-id-type="medline">20042745</pub-id>
          <pub-id pub-id-type="pii">NEJMp0912825</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Friedman</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hripcsak</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Natural language processing and its future in medicine</article-title>
          <source>Acad Med</source>
          <year>1999</year>
          <month>08</month>
          <volume>74</volume>
          <issue>8</issue>
          <fpage>890</fpage>
          <lpage>5</lpage>
          <pub-id pub-id-type="doi">10.1097/00001888-199908000-00012</pub-id>
          <pub-id pub-id-type="medline">10495728</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gans</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kralewski</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hammons</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Dowd</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Medical groups' adoption of electronic health records and information systems</article-title>
          <source>Health Aff (Millwood)</source>
          <year>2005</year>
          <month>09</month>
          <volume>24</volume>
          <issue>5</issue>
          <fpage>1323</fpage>
          <lpage>33</lpage>
          <pub-id pub-id-type="doi">10.1377/hlthaff.24.5.1323</pub-id>
          <pub-id pub-id-type="medline">16162580</pub-id>
          <pub-id pub-id-type="pii">24/5/1323</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hripcsak</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Duke</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>NH</given-names>
            </name>
            <name name-style="western">
              <surname>Reich</surname>
              <given-names>CG</given-names>
            </name>
            <name name-style="western">
              <surname>Huser</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Schuemie</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Suchard</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>RW</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>ICK</given-names>
            </name>
            <name name-style="western">
              <surname>Rijnbeek</surname>
              <given-names>PR</given-names>
            </name>
            <name name-style="western">
              <surname>van der Lei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Pratt</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Norén</surname>
              <given-names>GN</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Stang</surname>
              <given-names>PE</given-names>
            </name>
            <name name-style="western">
              <surname>Madigan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ryan</surname>
              <given-names>PB</given-names>
            </name>
          </person-group>
          <article-title>Observational Health Data Sciences and Informatics (OHDSI): Opportunities for Observational Researchers</article-title>
          <source>Stud Health Technol Inform</source>
          <year>2015</year>
          <volume>216</volume>
          <fpage>574</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26262116"/>
          </comment>
          <pub-id pub-id-type="medline">26262116</pub-id>
          <pub-id pub-id-type="pmcid">PMC4815923</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Suchard</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Schuemie</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Krumholz</surname>
              <given-names>HM</given-names>
            </name>
            <name name-style="western">
              <surname>You</surname>
              <given-names>SC</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Pratt</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Reich</surname>
              <given-names>CG</given-names>
            </name>
            <name name-style="western">
              <surname>Duke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Madigan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hripcsak</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ryan</surname>
              <given-names>PB</given-names>
            </name>
          </person-group>
          <article-title>Comprehensive comparative effectiveness and safety of first-line antihypertensive drug classes: a systematic, multinational, large-scale analysis</article-title>
          <source>Lancet</source>
          <year>2019</year>
          <month>11</month>
          <day>16</day>
          <volume>394</volume>
          <issue>10211</issue>
          <fpage>1816</fpage>
          <lpage>1826</lpage>
          <pub-id pub-id-type="doi">10.1016/S0140-6736(19)32317-7</pub-id>
          <pub-id pub-id-type="medline">31668726</pub-id>
          <pub-id pub-id-type="pii">S0140-6736(19)32317-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC6924620</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>NH</given-names>
            </name>
            <name name-style="western">
              <surname>Hripcsak</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Deep phenotyping: Embracing complexity and temporality—Towards scalability, portability, and interoperability</article-title>
          <source>J Biomed Inform</source>
          <year>2020</year>
          <month>05</month>
          <volume>105</volume>
          <fpage>103433</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/32335224"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2020.103433</pub-id>
          <pub-id pub-id-type="medline">32335224</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(20)30061-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC7179504</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schuemie</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ryan</surname>
              <given-names>PB</given-names>
            </name>
            <name name-style="western">
              <surname>Hripcsak</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Madigan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Suchard</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>Improving reproducibility by using high-throughput observational studies with empirical calibration</article-title>
          <source>Philos Trans A Math Phys Eng Sci</source>
          <year>2018</year>
          <month>09</month>
          <day>13</day>
          <volume>376</volume>
          <issue>2128</issue>
          <fpage>20170356</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/30082302"/>
          </comment>
          <pub-id pub-id-type="doi">10.1098/rsta.2017.0356</pub-id>
          <pub-id pub-id-type="medline">30082302</pub-id>
          <pub-id pub-id-type="pii">rsta.2017.0356</pub-id>
          <pub-id pub-id-type="pmcid">PMC6107542</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reps</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schuemie</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Suchard</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ryan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Rijnbeek</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2018</year>
          <month>08</month>
          <day>01</day>
          <volume>25</volume>
          <issue>8</issue>
          <fpage>969</fpage>
          <lpage>975</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/29718407"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocy032</pub-id>
          <pub-id pub-id-type="medline">29718407</pub-id>
          <pub-id pub-id-type="pii">4989437</pub-id>
          <pub-id pub-id-type="pmcid">PMC6077830</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ford</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Nicholson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Koeling</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Tate</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Carroll</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Axelrod</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>HE</given-names>
            </name>
            <name name-style="western">
              <surname>Rait</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Davies</surname>
              <given-names>KA</given-names>
            </name>
            <name name-style="western">
              <surname>Petersen</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Cassell</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>Optimising the use of electronic health records to estimate the incidence of rheumatoid arthritis in primary care: what information is hidden in free text?</article-title>
          <source>BMC Med Res Methodol</source>
          <year>2013</year>
          <month>08</month>
          <day>21</day>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>105</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/1471-2288-13-105"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2288-13-105</pub-id>
          <pub-id pub-id-type="medline">23964710</pub-id>
          <pub-id pub-id-type="pii">1471-2288-13-105</pub-id>
          <pub-id pub-id-type="pmcid">PMC3765394</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rosenbloom</surname>
              <given-names>ST</given-names>
            </name>
            <name name-style="western">
              <surname>Denny</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lorenzi</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Stead</surname>
              <given-names>WW</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>KB</given-names>
            </name>
          </person-group>
          <article-title>Data from clinical notes: a perspective on the tension between structure and flexible documentation</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2011</year>
          <month>03</month>
          <day>01</day>
          <volume>18</volume>
          <issue>2</issue>
          <fpage>181</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1136/jamia.2010.007237</pub-id>
          <pub-id pub-id-type="medline">21233086</pub-id>
          <pub-id pub-id-type="pii">jamia.2010.007237</pub-id>
          <pub-id pub-id-type="pmcid">PMC3116264</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Uzuner</surname>
              <given-names>Ö</given-names>
            </name>
            <name name-style="western">
              <surname>Stubbs</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lenert</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Advancing the state of the art in automatic extraction of adverse drug events from narratives</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2020</year>
          <month>01</month>
          <day>01</day>
          <volume>27</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>2</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/31841150"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocz206</pub-id>
          <pub-id pub-id-type="medline">31841150</pub-id>
          <pub-id pub-id-type="pii">5678873</pub-id>
          <pub-id pub-id-type="pmcid">PMC6913224</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>WW</given-names>
            </name>
            <name name-style="western">
              <surname>Nadkarni</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Hirschman</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>D'Avolio</surname>
              <given-names>LW</given-names>
            </name>
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>GK</given-names>
            </name>
            <name name-style="western">
              <surname>Uzuner</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Overcoming barriers to NLP for clinical text: the role of shared tasks and the need for additional creative solutions</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2011</year>
          <month>10</month>
          <volume>18</volume>
          <issue>5</issue>
          <fpage>540</fpage>
          <lpage>3</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/21846785"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2011-000465</pub-id>
          <pub-id pub-id-type="medline">21846785</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2011-000465</pub-id>
          <pub-id pub-id-type="pmcid">PMC3168329</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chowdhury</surname>
              <given-names>GG</given-names>
            </name>
          </person-group>
          <article-title>Natural language processing</article-title>
          <source>Annu Rev Inf Sci Technol</source>
          <year>2005</year>
          <month>01</month>
          <day>31</day>
          <volume>37</volume>
          <issue>1</issue>
          <fpage>51</fpage>
          <lpage>89</lpage>
          <pub-id pub-id-type="doi">10.1002/aris.1440370103</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Rastegar-Mojarad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Moon</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Afzal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Mehrabi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sohn</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Clinical information extraction applications: A literature review</article-title>
          <source>J Biomed Inform</source>
          <year>2018</year>
          <month>01</month>
          <volume>77</volume>
          <fpage>34</fpage>
          <lpage>49</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(17)30256-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2017.11.011</pub-id>
          <pub-id pub-id-type="medline">29162496</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(17)30256-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC5771858</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Stubbs</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kotfila</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Uzuner</surname>
              <given-names>Ö</given-names>
            </name>
          </person-group>
          <article-title>Identifying risk factors for heart disease over time: Overview of 2014 i2b2/UTHealth shared task Track 2</article-title>
          <source>J Biomed Inform</source>
          <year>2015</year>
          <month>12</month>
          <volume>58 Suppl</volume>
          <fpage>S67</fpage>
          <lpage>77</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(15)00140-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2015.07.001</pub-id>
          <pub-id pub-id-type="medline">26210362</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(15)00140-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC4978189</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Styler</surname>
              <given-names>WF</given-names>
            </name>
            <name name-style="western">
              <surname>Bethard</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Finan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Palmer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pradhan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>de Groen</surname>
              <given-names>PC</given-names>
            </name>
            <name name-style="western">
              <surname>Erickson</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Pustejovsky</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Temporal Annotation in the Clinical Domain</article-title>
          <source>Trans Assoc Comput Linguist</source>
          <year>2014</year>
          <month>04</month>
          <volume>2</volume>
          <fpage>143</fpage>
          <lpage>154</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/29082229"/>
          </comment>
          <pub-id pub-id-type="medline">29082229</pub-id>
          <pub-id pub-id-type="pmcid">PMC5657277</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hong</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Wen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mojarad</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Sohn</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Standardizing Heterogeneous Annotation Corpora Using HL7 FHIR for Facilitating their Reuse and Integration in Clinical NLP</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2018</year>
          <volume>2018</volume>
          <fpage>574</fpage>
          <lpage>583</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/30815098"/>
          </comment>
          <pub-id pub-id-type="medline">30815098</pub-id>
          <pub-id pub-id-type="pmcid">PMC6371380</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Cardell-Oliver</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Redcoat: A Collaborative Annotation Tool for Hierarchical Entity Typing</article-title>
          <source>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP): System Demonstrations</source>
          <year>2019</year>
          <conf-name>2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP): System Demonstrations</conf-name>
          <conf-date>November 3-7, 2019</conf-date>
          <conf-loc>Hong Kong, China</conf-loc>
          <fpage>193</fpage>
          <lpage>198</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/d19-3033</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nye</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Marshall</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Nenkova</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wallace</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>A Corpus with Multi-Level Annotations of Patients, Interventions and Outcomes to Support Language Processing for Medical Literature</article-title>
          <source>Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</source>
          <year>2018</year>
          <conf-name>56th Annual Meeting of the Association for Computational Linguistics</conf-name>
          <conf-date>July 15-20, 2018</conf-date>
          <conf-loc>Melbourne, Australia</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/p18-1019</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Campillos</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Deléger</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Grouin</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hamon</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ligozat</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Névéol</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>A French clinical corpus with comprehensive semantic annotations: development of the Medical Entity and Relation LIMSI annOtated Text corpus (MERLOT)</article-title>
          <source>Lang Resources &amp; Evaluation</source>
          <year>2017</year>
          <month>02</month>
          <day>15</day>
          <volume>52</volume>
          <issue>2</issue>
          <fpage>571</fpage>
          <lpage>601</lpage>
          <pub-id pub-id-type="doi">10.1007/s10579-017-9382-y</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Curtis</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>MG</given-names>
            </name>
            <name name-style="western">
              <surname>Boudreau</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Cooper</surname>
              <given-names>WO</given-names>
            </name>
            <name name-style="western">
              <surname>Daniel</surname>
              <given-names>GW</given-names>
            </name>
            <name name-style="western">
              <surname>Nair</surname>
              <given-names>VP</given-names>
            </name>
            <name name-style="western">
              <surname>Raebel</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Beaulieu</surname>
              <given-names>NU</given-names>
            </name>
            <name name-style="western">
              <surname>Rosofsky</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Woodworth</surname>
              <given-names>TS</given-names>
            </name>
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>JS</given-names>
            </name>
          </person-group>
          <article-title>Design considerations, architecture, and use of the Mini-Sentinel distributed data system</article-title>
          <source>Pharmacoepidemiol Drug Saf</source>
          <year>2012</year>
          <month>01</month>
          <volume>21 Suppl 1</volume>
          <fpage>23</fpage>
          <lpage>31</lpage>
          <pub-id pub-id-type="doi">10.1002/pds.2336</pub-id>
          <pub-id pub-id-type="medline">22262590</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Garza</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Del Fiol</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Tenenbaum</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Walden</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Zozus</surname>
              <given-names>MN</given-names>
            </name>
          </person-group>
          <article-title>Evaluating common data models for use with a longitudinal community registry</article-title>
          <source>J Biomed Inform</source>
          <year>2016</year>
          <month>12</month>
          <volume>64</volume>
          <fpage>333</fpage>
          <lpage>341</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(16)30153-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2016.10.016</pub-id>
          <pub-id pub-id-type="medline">27989817</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(16)30153-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC6810649</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hanauer</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Mei</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Law</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Khanna</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Supporting information retrieval from electronic health records: A report of University of Michigan's nine-year experience in developing and using the Electronic Medical Record Search Engine (EMERSE)</article-title>
          <source>J Biomed Inform</source>
          <year>2015</year>
          <month>06</month>
          <volume>55</volume>
          <fpage>290</fpage>
          <lpage>300</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(15)00082-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2015.05.003</pub-id>
          <pub-id pub-id-type="medline">25979153</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(15)00082-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC4527540</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Toti</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Morley</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ibrahim</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Folarin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jackson</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kartoglu</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Agrawal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Stringer</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Gale</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Gorrell</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Broadbent</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Dobson</surname>
              <given-names>RJB</given-names>
            </name>
          </person-group>
          <article-title>SemEHR: A general-purpose semantic search system to surface semantic data from clinical notes for tailored care, trial recruitment, and clinical research</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2018</year>
          <month>05</month>
          <day>01</day>
          <volume>25</volume>
          <issue>5</issue>
          <fpage>530</fpage>
          <lpage>537</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/29361077"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocx160</pub-id>
          <pub-id pub-id-type="medline">29361077</pub-id>
          <pub-id pub-id-type="pii">4817428</pub-id>
          <pub-id pub-id-type="pmcid">PMC6019046</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>GK</given-names>
            </name>
            <name name-style="western">
              <surname>Masanz</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ogren</surname>
              <given-names>PV</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sohn</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kipper-Schuler</surname>
              <given-names>KC</given-names>
            </name>
            <name name-style="western">
              <surname>Chute</surname>
              <given-names>CG</given-names>
            </name>
          </person-group>
          <article-title>Mayo clinical Text Analysis and Knowledge Extraction System (cTAKES): architecture, component evaluation and applications</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2010</year>
          <volume>17</volume>
          <issue>5</issue>
          <fpage>507</fpage>
          <lpage>513</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/20819853"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/jamia.2009.001560</pub-id>
          <pub-id pub-id-type="medline">20819853</pub-id>
          <pub-id pub-id-type="pii">17/5/507</pub-id>
          <pub-id pub-id-type="pmcid">PMC2995668</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Soysal</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Pakhomov</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>CLAMP - a toolkit for efficiently building customized clinical natural language processing pipelines</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2018</year>
          <month>03</month>
          <day>01</day>
          <volume>25</volume>
          <issue>3</issue>
          <fpage>331</fpage>
          <lpage>336</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/29186491"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocx132</pub-id>
          <pub-id pub-id-type="medline">29186491</pub-id>
          <pub-id pub-id-type="pii">4657212</pub-id>
          <pub-id pub-id-type="pmcid">PMC7378877</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hong</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Bedrick</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hersh</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Implementation of a Cohort Retrieval System for Clinical Data Repositories Using the Observational Medical Outcomes Partnership Common Data Model: Proof-of-Concept System Validation</article-title>
          <source>JMIR Med Inform</source>
          <year>2020</year>
          <month>10</month>
          <day>06</day>
          <volume>8</volume>
          <issue>10</issue>
          <fpage>e17376</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2020/10/e17376/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/17376</pub-id>
          <pub-id pub-id-type="medline">33021486</pub-id>
          <pub-id pub-id-type="pii">v8i10e17376</pub-id>
          <pub-id pub-id-type="pmcid">PMC7576539</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Mao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Vatani</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Yao</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Rasmussen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Pathak</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Developing a portable natural language processing based phenotyping system</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2019</year>
          <month>04</month>
          <day>04</day>
          <volume>19</volume>
          <issue>Suppl 3</issue>
          <fpage>78</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-019-0786-z"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-019-0786-z</pub-id>
          <pub-id pub-id-type="medline">30943974</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-019-0786-z</pub-id>
          <pub-id pub-id-type="pmcid">PMC6448187</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Vydiswaran</surname>
              <given-names>VGV</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Stubbs</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Uzuner</surname>
              <given-names>Ö</given-names>
            </name>
            <name name-style="western">
              <surname>Gururaj</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Bayer</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Aberdeen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rumshisky</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Pakhomov</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Ease of adoption of clinical natural language processing software: An evaluation of five systems</article-title>
          <source>J Biomed Inform</source>
          <year>2015</year>
          <month>12</month>
          <volume>58 Suppl</volume>
          <fpage>S189</fpage>
          <lpage>S196</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(15)00148-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2015.07.008</pub-id>
          <pub-id pub-id-type="medline">26210361</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(15)00148-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC4974203</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Park</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>ABMI / SOCRATex</article-title>
          <source>GitHub</source>
          <access-date>2021-03-23</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/ABMI/SOCRATex">https://github.com/ABMI/SOCRATex</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hripcsak</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Shang</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Peissig</surname>
              <given-names>PL</given-names>
            </name>
            <name name-style="western">
              <surname>Rasmussen</surname>
              <given-names>LV</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Benoit</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Carroll</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Carrell</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Denny</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Dikilitas</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Gainer</surname>
              <given-names>VS</given-names>
            </name>
            <name name-style="western">
              <surname>Howell</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Klann</surname>
              <given-names>JG</given-names>
            </name>
            <name name-style="western">
              <surname>Kullo</surname>
              <given-names>IJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lingren</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mentch</surname>
              <given-names>FD</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>SN</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Pacheco</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Wiley</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Facilitating phenotype transfer using a common data model</article-title>
          <source>J Biomed Inform</source>
          <year>2019</year>
          <month>08</month>
          <volume>96</volume>
          <fpage>103253</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(19)30172-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2019.103253</pub-id>
          <pub-id pub-id-type="medline">31325501</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(19)30172-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC6697565</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reich</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ryan</surname>
              <given-names>PB</given-names>
            </name>
            <name name-style="western">
              <surname>Stang</surname>
              <given-names>PE</given-names>
            </name>
            <name name-style="western">
              <surname>Rocca</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Evaluation of alternative standardized terminologies for medical conditions within a network of observational healthcare databases</article-title>
          <source>J Biomed Inform</source>
          <year>2012</year>
          <month>08</month>
          <volume>45</volume>
          <issue>4</issue>
          <fpage>689</fpage>
          <lpage>696</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(12)00069-X"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2012.05.002</pub-id>
          <pub-id pub-id-type="medline">22683994</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(12)00069-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Blei</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>AY</given-names>
            </name>
            <name name-style="western">
              <surname>Jordan</surname>
              <given-names>MI</given-names>
            </name>
          </person-group>
          <article-title>Latent Dirichlet allocation</article-title>
          <source>Journal of Machine Learning Research</source>
          <year>2003</year>
          <volume>3</volume>
          <fpage>993</fpage>
          <lpage>1022</lpage>
          <pub-id pub-id-type="doi">10.5555/944919.944937</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jelodar</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Latent Dirichlet allocation (LDA) and topic modeling: models, applications, a survey</article-title>
          <source>Multimed Tools Appl</source>
          <year>2018</year>
          <month>11</month>
          <day>28</day>
          <volume>78</volume>
          <issue>11</issue>
          <fpage>15169</fpage>
          <lpage>15211</lpage>
          <pub-id pub-id-type="doi">10.1007/s11042-018-6894-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Griffiths</surname>
              <given-names>TL</given-names>
            </name>
            <name name-style="western">
              <surname>Steyvers</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Finding scientific topics</article-title>
          <source>Proc Natl Acad Sci U S A</source>
          <year>2004</year>
          <month>04</month>
          <day>06</day>
          <volume>101 Suppl 1</volume>
          <fpage>5228</fpage>
          <lpage>35</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.pnas.org/cgi/pmidlookup?view=long&#38;pmid=14872004"/>
          </comment>
          <pub-id pub-id-type="doi">10.1073/pnas.0307752101</pub-id>
          <pub-id pub-id-type="medline">14872004</pub-id>
          <pub-id pub-id-type="pii">0307752101</pub-id>
          <pub-id pub-id-type="pmcid">PMC387300</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xia</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A density-based method for adaptive LDA model selection</article-title>
          <source>Neurocomputing</source>
          <year>2009</year>
          <month>03</month>
          <volume>72</volume>
          <issue>7-9</issue>
          <fpage>1775</fpage>
          <lpage>1781</lpage>
          <pub-id pub-id-type="doi">10.1016/j.neucom.2008.06.011</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Arun</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Suresh</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Madhavan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Murthy</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>On Finding the Natural Number of Topics with Latent Dirichlet Allocation: Some Observations</article-title>
          <source>PAKDD 2010: Advances in Knowledge Discovery and Data Mining</source>
          <year>2010</year>
          <conf-name>Pacific-Asia Conference on Knowledge Discovery and Data Mining</conf-name>
          <conf-date>July 21-24, 2010</conf-date>
          <conf-loc>Hyderabad, India</conf-loc>
          <pub-id pub-id-type="doi">10.1007/978-3-642-13657-3_43</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Deveaud</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>SanJuan</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bellot</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Accurate and effective latent concept modeling for ad hoc information retrieval</article-title>
          <source>Document numérique</source>
          <year>2014</year>
          <month>04</month>
          <day>30</day>
          <volume>17</volume>
          <issue>1</issue>
          <fpage>61</fpage>
          <lpage>84</lpage>
          <pub-id pub-id-type="doi">10.3166/dn.17.1.61-84</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd-Graber</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gerrish</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Blei</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Reading tea leaves: how humans interpret topic models</article-title>
          <source>Proceedings of the 22nd International Conference on Neural Information Processing Systems</source>
          <year>2009</year>
          <conf-name>22nd International Conference on Neural Information Processing Systems</conf-name>
          <conf-date>December 2009</conf-date>
          <conf-loc>Vancouver</conf-loc>
          <fpage>288</fpage>
          <lpage>296</lpage>
          <pub-id pub-id-type="doi">10.5555/2984093.2984126</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pezoa</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Reutter</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Suarez</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Ugarte</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Vrgoč</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Foundations of JSON Schema</article-title>
          <source>Proceedings of the 25th International Conference on World Wide Web</source>
          <year>2016</year>
          <conf-name>25th International Conference on World Wide Web</conf-name>
          <conf-date>April 2016</conf-date>
          <conf-loc>Montréal</conf-loc>
          <fpage>263</fpage>
          <lpage>273</lpage>
          <pub-id pub-id-type="doi">10.1145/2872427.2883029</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kononenko</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Baysal</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Holmes</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Godfrey</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Mining modern repositories with elasticsearch</article-title>
          <source>Proceedings of the 11th Working Conference on Mining Software Repositories</source>
          <year>2014</year>
          <conf-name>11th Working Conference on Mining Software Repositories</conf-name>
          <conf-date>May 2014</conf-date>
          <conf-loc>Hyderabad, India</conf-loc>
          <fpage>328</fpage>
          <lpage>331</lpage>
          <pub-id pub-id-type="doi">10.1145/2597073.2597091</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ahn</surname>
              <given-names>EK</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>MY</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>SY</given-names>
            </name>
            <name name-style="western">
              <surname>Ryan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Schuemie</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>RW</given-names>
            </name>
          </person-group>
          <article-title>Conversion and Data Quality Assessment of Electronic Health Record Data at a Korean Tertiary Teaching Hospital to a Common Data Model for Distributed Network Research</article-title>
          <source>Healthc Inform Res</source>
          <year>2016</year>
          <month>01</month>
          <volume>22</volume>
          <issue>1</issue>
          <fpage>54</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.e-hir.org/DOIx.php?id=10.4258/hir.2016.22.1.54"/>
          </comment>
          <pub-id pub-id-type="doi">10.4258/hir.2016.22.1.54</pub-id>
          <pub-id pub-id-type="medline">26893951</pub-id>
          <pub-id pub-id-type="pmcid">PMC4756059</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Srigley</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>McGowan</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Maclean</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Raby</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ross</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kramer</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sawka</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Standardized synoptic cancer pathology reporting: a population-based approach</article-title>
          <source>J Surg Oncol</source>
          <year>2009</year>
          <month>06</month>
          <day>15</day>
          <volume>99</volume>
          <issue>8</issue>
          <fpage>517</fpage>
          <lpage>24</lpage>
          <pub-id pub-id-type="doi">10.1002/jso.21282</pub-id>
          <pub-id pub-id-type="medline">19466743</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Voigt</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>von dem Bussche</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Practical Implementation of the Requirements Under the GDPR</article-title>
          <source>The EU General Data Protection Regulation (GDPR)</source>
          <year>2017</year>
          <publisher-loc>Cham</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>245</fpage>
          <lpage>249</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>Centers for Disease Control and Prevention (CDC)</collab>
          </person-group>
          <article-title>HIPAA privacy rule and public health. Guidance from CDC and the U.S. Department of Health and Human Services</article-title>
          <source>MMWR Suppl</source>
          <year>2003</year>
          <month>05</month>
          <day>02</day>
          <volume>52</volume>
          <fpage>1</fpage>
          <lpage>17, 19</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.cdc.gov/mmwr/preview/mmwrhtml/su5201a1.htm"/>
          </comment>
          <pub-id pub-id-type="medline">12741579</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>YR</given-names>
            </name>
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lyu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>JH</given-names>
            </name>
          </person-group>
          <article-title>A De-identification method for bilingual clinical texts of various note types</article-title>
          <source>J Korean Med Sci</source>
          <year>2015</year>
          <month>01</month>
          <volume>30</volume>
          <issue>1</issue>
          <fpage>7</fpage>
          <lpage>15</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://jkms.org/DOIx.php?id=10.3346/jkms.2015.30.1.7"/>
          </comment>
          <pub-id pub-id-type="doi">10.3346/jkms.2015.30.1.7</pub-id>
          <pub-id pub-id-type="medline">25552878</pub-id>
          <pub-id pub-id-type="pmcid">PMC4278030</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <collab>American Psychiatric Association</collab>
          </person-group>
          <source>Diagnostic and statistical manual of mental disorders (DSM-5®)</source>
          <year>2013</year>
          <publisher-loc>Washington, DC</publisher-loc>
          <publisher-name>American Psychiatric Association Publishing</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Baek</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ha</surname>
              <given-names>EJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>YH</given-names>
            </name>
            <name name-style="western">
              <surname>Lim</surname>
              <given-names>HK</given-names>
            </name>
            <name name-style="western">
              <surname>Moon</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Na</surname>
              <given-names>DG</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>YJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hahn</surname>
              <given-names>SY</given-names>
            </name>
            <name name-style="western">
              <surname>Jeon</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Jung</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Kwak</surname>
              <given-names>JY</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>CY</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sung</surname>
              <given-names>JY</given-names>
            </name>
            <collab>Korean Society of Thyroid Radiology (KSThR) and Korean Society of Radiology</collab>
          </person-group>
          <article-title>Ultrasonography Diagnosis and Imaging-Based Management of Thyroid Nodules: Revised Korean Society of Thyroid Radiology Consensus Statement and Recommendations</article-title>
          <source>Korean J Radiol</source>
          <year>2016</year>
          <volume>17</volume>
          <issue>3</issue>
          <fpage>370</fpage>
          <lpage>95</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.kjronline.org/DOIx.php?id=10.3348/kjr.2016.17.3.370"/>
          </comment>
          <pub-id pub-id-type="doi">10.3348/kjr.2016.17.3.370</pub-id>
          <pub-id pub-id-type="medline">27134526</pub-id>
          <pub-id pub-id-type="pmcid">PMC4842857</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>CS</given-names>
            </name>
            <name name-style="western">
              <surname>Jang</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>TW</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JC</given-names>
            </name>
          </person-group>
          <article-title>Prognostic Significance of Lymphovascular Invasion in Sporadic Colorectal Cancer</article-title>
          <source>Diseases of the Colon &#38; Rectum</source>
          <year>2010</year>
          <volume>53</volume>
          <issue>4</issue>
          <fpage>377</fpage>
          <lpage>384</lpage>
          <pub-id pub-id-type="doi">10.1007/dcr.0b013e3181cf8ae5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lykke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Roikjaer</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Jess</surname>
              <given-names>P</given-names>
            </name>
            <collab>Danish Colorectal Cancer Group</collab>
          </person-group>
          <article-title>The relation between lymph node status and survival in Stage I-III colon cancer: results from a prospective nationwide cohort study</article-title>
          <source>Colorectal Dis</source>
          <year>2013</year>
          <month>05</month>
          <day>25</day>
          <volume>15</volume>
          <issue>5</issue>
          <fpage>559</fpage>
          <lpage>65</lpage>
          <pub-id pub-id-type="doi">10.1111/codi.12059</pub-id>
          <pub-id pub-id-type="medline">23061638</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: Pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)</source>
          <year>2019</year>
          <conf-name>2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name>
          <conf-date>June 2019</conf-date>
          <conf-loc>Minneapolis, MN</conf-loc>
          <fpage>4171</fpage>
          <lpage>4186</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Carbonell</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Salakhutdinov</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>XLNet: Generalized Autoregressive Pretraining for Language Understanding</article-title>
          <source>Advances in Neural Information Processing Systems 32 (NeurIPS 2019)</source>
          <year>2019</year>
          <conf-name>2019 Conference on Neural Information Processing Systems</conf-name>
          <conf-date>December 8-14, 2019</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mann</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ryder</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Subbiah</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kaplan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dhariwal</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Neelakantan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shyam</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sastry</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Askell</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Herbert-Voss</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Krueger</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Henighan</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Child</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ramesh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ziegler</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Winter</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hesse</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sigler</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Litwin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gray</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chess</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Berner</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>McCandlish</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Radford</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Amodei</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Language Models are Few-Shot Learners</article-title>
          <source>Advances in Neural Information Processing Systems 33 (NeurIPS 2020)</source>
          <year>2020</year>
          <conf-name>2020 Conference on Neural Information Processing Systems</conf-name>
          <conf-date>December 6-12, 2020</conf-date>
          <conf-loc>Virtual</conf-loc>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
