<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v12i1e59680</article-id>
      <article-id pub-id-type="pmid">38954456</article-id>
      <article-id pub-id-type="doi">10.2196/59680</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Is Boundary Annotation Necessary? Evaluating Boundary-Free Approaches to Improve Clinical Named Entity Annotation Efficiency: Case Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Gaudet-Blavignac</surname>
            <given-names>Christophe</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Raithel</surname>
            <given-names>Lisa</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Herman Bernardim Andrade</surname>
            <given-names>Gabriel</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1950-8069</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Yada</surname>
            <given-names>Shuntaro</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6209-1054</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Aramaki</surname>
            <given-names>Eiji</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Graduate School of Science and Technology</institution>
            <institution>Nara Institute of Science and Technology</institution>
            <addr-line>8916-5, Takayama-cho</addr-line>
            <addr-line>Ikoma, 630-0192</addr-line>
            <country>Japan</country>
            <phone>81 743 72 5250</phone>
            <email>aramaki@is.naist.jp</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0201-3609</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Graduate School of Science and Technology</institution>
        <institution>Nara Institute of Science and Technology</institution>
        <addr-line>Ikoma</addr-line>
        <country>Japan</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Eiji Aramaki <email>aramaki@is.naist.jp</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>2</day>
        <month>7</month>
        <year>2024</year>
      </pub-date>
      <volume>12</volume>
      <elocation-id>e59680</elocation-id>
      <history>
        <date date-type="received">
          <day>19</day>
          <month>4</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>9</day>
          <month>5</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>23</day>
          <month>5</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>25</day>
          <month>5</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Gabriel Herman Bernardim Andrade, Shuntaro Yada, Eiji Aramaki. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 02.07.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2024/1/e59680" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Named entity recognition (NER) is a fundamental task in natural language processing. However, it is typically preceded by named entity annotation, which poses several challenges, especially in the clinical domain. For instance, determining entity boundaries is one of the most common sources of disagreements between annotators due to questions such as whether modifiers or peripheral words should be annotated. If unresolved, these can induce inconsistency in the produced corpora, yet, on the other hand, strict guidelines or adjudication sessions can further prolong an already slow and convoluted process.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>The aim of this study is to address these challenges by evaluating 2 novel annotation methodologies, <italic>lenient span</italic> and <italic>point annotation</italic>, aiming to mitigate the difficulty of precisely determining entity boundaries.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We evaluate their effects through an annotation case study on a Japanese medical case report data set. We compare annotation time, annotator agreement, and the quality of the produced labeling and assess the impact on the performance of an NER system trained on the annotated corpus.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>We saw significant improvements in the labeling process efficiency, with up to a 25% reduction in overall annotation time and even a 10% improvement in annotator agreement compared to the traditional boundary-strict approach. However, even the best-achieved NER model presented some drop in performance compared to the traditional annotation methodology.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Our findings demonstrate a balance between annotation speed and model performance. Although disregarding boundary information affects model performance to some extent, this is counterbalanced by significant reductions in the annotator’s workload and notable improvements in the speed of the annotation process. These benefits may prove valuable in various applications, offering an attractive compromise for developers and researchers.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>named entity recognition</kwd>
        <kwd>information extraction</kwd>
        <kwd>text annotation</kwd>
        <kwd>entity boundaries</kwd>
        <kwd>lenient annotation</kwd>
        <kwd>case reports</kwd>
        <kwd>annotation</kwd>
        <kwd>case study</kwd>
        <kwd>medical case report</kwd>
        <kwd>efficiency</kwd>
        <kwd>model</kwd>
        <kwd>model performance</kwd>
        <kwd>dataset</kwd>
        <kwd>Japan</kwd>
        <kwd>Japanese</kwd>
        <kwd>entity</kwd>
        <kwd>clinical domain</kwd>
        <kwd>clinical</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Overview</title>
        <p>The electronic health record (EHR) can be an important source of data for health-related research as it contains information on a patient’s condition and complaints, performed procedures and administered drugs, the outcome of the treatment, and more [<xref ref-type="bibr" rid="ref1">1</xref>].</p>
        <p>Clinical narratives are a fundamental part of EHRs. Due to their free and unstructured format, natural language processing (NLP) methods are essential for extracting the information from such documents in a way that is comprehensible and useful for computer systems. Although machine learning–based NLP systems can achieve high performance, these often require large amounts of in-domain annotated data for proper training [<xref ref-type="bibr" rid="ref2">2</xref>]. Recent few-shot approaches empowered by large language models (LLMs) have also been shown to be performant. Yet, these can also benefit from fine-tuning with in-domain examples, yielding notable improvements [<xref ref-type="bibr" rid="ref3">3</xref>].</p>
        <p>Named entity (NE) annotation is an inherently manual process; combined with the sheer volume of data that must be meticulously labeled to produce an accurate model, this makes it an exhausting and time-consuming task [<xref ref-type="bibr" rid="ref4">4</xref>]. Particularly when annotating clinical data, workers must possess not only linguistic understanding but also specialized medical knowledge. Recruiting such a capable workforce can make the process rather costly [<xref ref-type="bibr" rid="ref5">5</xref>].</p>
        <p>Furthermore, annotation is accompanied by a set of practical issues. For instance, it is natural that contributors disagree on how certain information is annotated or even whether it should be annotated [<xref ref-type="bibr" rid="ref6">6</xref>]. Determining entity boundaries, meaning where a concept starts and ends, is one of the primary sources of conflict during the process, as so-called <italic>boundary words</italic>, such as articles or adjectives, can induce ambiguity [<xref ref-type="bibr" rid="ref7">7</xref>].</p>
        <p>Especially in medical texts, it is common for annotators to be unsure whether adjectives or modifiers should be included in the annotation. For example, in the sentence presented in <xref rid="figure1" ref-type="fig">Figure 1</xref>, some may annotate only the core symptom (“inflammation”).</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Example of different annotation paradigms. Traditional annotation (a) requires precisely labeling the beginning and end of the span, while boundary-free (b and c) methods focus on only identifying the core term.</p>
          </caption>
          <graphic xlink:href="medinform_v12i1e59680_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Conversely, others may consider adding all modifiers necessary for a complete encapsulation of the condition.</p>
        <p>While entity boundary definition is a problem that affects all languages, scriptio continua languages (which do not have spaces between words), such as Japanese, Chinese, and Korean, are particularly impacted due to the increased difficulty in separating concepts and modifiers.</p>
        <p>One can employ strict annotation guidelines to delineate precisely how information should be annotated and even implement adjudication sessions to resolve disagreements. Yet, these can increase the workload and complexity of an already slow and convoluted process.</p>
        <p>As an alternative to mitigate such issues, we propose to reformulate the annotation task by eliminating the need to define specific span boundaries when annotating an NE. By demanding less precision from the annotators, we expect to minimize the required decision-making during labeling, thus improving annotation speed and relieving conflicts.</p>
        <p>Although this approach may reduce annotation quality, named entity recognition (NER) performance should not be significantly impacted, as previous research found that models are resilient to a certain amount of boundary imprecision in their training data [<xref ref-type="bibr" rid="ref8">8</xref>].</p>
        <p>In this paper, we leverage this phenomenon by introducing 2 <italic>boundary-free</italic> annotation methodologies: <italic>lenient span</italic>, which relieves the emphasis on entity boundary precision, and <italic>point</italic>, which uses a single position to represent the annotation. <xref rid="figure1" ref-type="fig">Figure 1</xref> presents a visual comparison between the methodologies. We performed a case study to evaluate the efficiency of the proposed methods when annotating a corpus of Japanese medical case reports to create training data for an NER system.</p>
        <p>Our contributions are summarized as follows. We present 2 novel boundary-free annotation methodologies, evaluate the efficiency of the annotation process by metrics of annotation time and annotator agreement, and analyze the impact on the performance of an NER system trained with the annotated corpora.</p>
      </sec>
      <sec>
        <title>Related Work</title>
        <sec>
          <title>Annotation Efficiency Improvements</title>
          <p>Attempts to improve the annotation workflow are a common theme in NER-related research.</p>
          <p>Preannotation refers to the automatic labeling of the text prior to the annotator’s work [<xref ref-type="bibr" rid="ref9">9</xref>]. This technique can not only reduce the required annotation time and workload but also minimize errors [<xref ref-type="bibr" rid="ref10">10</xref>]. Active learning (AL) [<xref ref-type="bibr" rid="ref11">11</xref>] can further optimize automatic labeling by iteratively incorporating the data produced during the annotation process to retrain the preannotation model. Kholghi et al [<xref ref-type="bibr" rid="ref12">12</xref>] ascertained that AL reduced the annotation time by up to 35% (5.6/16 hours) during experiments.</p>
          <p>While these are well-established approaches, recent studies also explore alternative ideas. Tokunaga et al [<xref ref-type="bibr" rid="ref13">13</xref>] analyzed eye-tracking data during NE annotation to identify characteristics that can help design effective features for an annotation tool. Saxena et al [<xref ref-type="bibr" rid="ref14">14</xref>] introduced a hybrid search-enhanced software that allows users to look for similar terms and annotate related information simultaneously, shortening work time when compared to standard tools.</p>
          <p>In recent years, generative LLMs have transformed NLP research and applications, becoming state-of-the-art NLP techniques. While the potential of LLMs to improve the text annotation workflow has also been evaluated in a few different studies [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref17">17</xref>], Tan et al [<xref ref-type="bibr" rid="ref18">18</xref>] point out that their effectiveness is still strongly affected by model hallucinations and the gap in performance between proprietary and open-source LLMs.</p>
          <p>Although crowdsourcing platforms allow the convenient annotation of vast amounts of data [<xref ref-type="bibr" rid="ref19">19</xref>], they do not improve task execution or reduce the workload of an individual worker. In addition, as Snow et al [<xref ref-type="bibr" rid="ref20">20</xref>] noted, inconsistent or low-quality annotations require effective quality control measures. Li [<xref ref-type="bibr" rid="ref21">21</xref>] found that LLMs can be used to improve the quality of annotation generated by crowdsourcing. Yet LLM annotation quality is still shy of what can be produced manually; thus, combining the automated technique and human effort is still the best approach to creating a high-quality data set [<xref ref-type="bibr" rid="ref22">22</xref>].</p>
        </sec>
        <sec>
          <title>Entity Boundary Imprecision</title>
          <p>When addressing boundary imprecision, most studies regard it as a form of noise that should be corrected or circumvented. For instance, Liu et al [<xref ref-type="bibr" rid="ref23">23</xref>] use confidence scores and normalization techniques based on the labeling structure to estimate the correct span.</p>
          <p>Zhu and Li [<xref ref-type="bibr" rid="ref7">7</xref>] introduced a boundary regularization technique, redistributing a portion of the probability assigned to an annotated span to its neighboring words. This process produces a smooth transition between entity annotations and their nonentity surroundings, mitigating annotation boundary inconsistencies.</p>
          <p>Shen et al [<xref ref-type="bibr" rid="ref24">24</xref>] propose the NER task as a boundary-denoising diffusion process, where a model is trained to derive precise NEs from noisy spans. The authors added controlled noise to gold entity boundaries and used the imprecise data to teach a model to apply a reverse diffusion process to recover the original entity boundaries.</p>
          <p>On the other hand, Andrade et al [<xref ref-type="bibr" rid="ref8">8</xref>] identified that imprecise boundary annotation may not have an extensive impact in some applications. The authors evaluated the effect of various levels of imprecise boundary annotation on NER and entity linking. They identified that models are resilient to a certain amount of noise, showing a small performance drop in that range.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Set</title>
        <p>We used the MedTxt-CR-JA corpus [<xref ref-type="bibr" rid="ref25">25</xref>] in our experiments. This data set comprises 148 open-access case reports in Japanese. <xref ref-type="boxed-text" rid="box1">Textbox 1</xref> presents an example document from the data set.</p>
        <p>A case report is a detailed description of a patient’s medical condition, containing, among other information, the temporal progression of the disease and its treatment. Its format is similar to that of a discharge summary, a document type frequently used in medical NLP, such as in MIMIC-III [<xref ref-type="bibr" rid="ref26">26</xref>] or n2c2 shared tasks [<xref ref-type="bibr" rid="ref27">27</xref>].</p>
        <p>This corpus was used in previous studies [<xref ref-type="bibr" rid="ref28">28</xref>] and contains pre-existing annotations for diseases and symptom names, drugs, anatomical parts, etc. Although we discarded these labels for our experiments, we use them as a gold standard (GS) for evaluation purposes. From now on, this set of annotations is identified as the <italic>gold standard corpus</italic> (GSC).</p>
        <boxed-text id="box1" position="float">
          <title>Example of a case report from MedTxt-CR-JA and its English translation.</title>
          <p>
            <bold>Original:</bold>
          </p>
          <p>５８歳，女性．</p>
          <p>初診の約２週間前より皮疹が出現，増悪してきたため来院した．</p>
          <p>初診時，体幹・四肢に広範囲に浮腫性紅斑が出現し，一部では小水疱を形成していた．</p>
          <p>手指背では関節部に一致して角化性紅斑を認め，爪囲には紅斑・紫斑を，眼周囲には軽度の紫紅色斑を認めた．</p>
          <p>この時点ではＣＰＫ，ＬＤＨの軽度上昇，抗核抗体２０倍以外，特に異常はなく，確診に至らないため，ステロイド軟膏外用にて経過観察していたところ，３週目頃より体幹・四肢の皮疹が角化性赤色斑へと変化し，１か月目頃より上眼瞼の浮腫性紅斑が著明となり，典型疹となった．</p>
          <p>肺癌の合併により発症１年２か月後に死亡した．</p>
          <p>臨床経過から，初診時にみられた多形紅斑様あるいは湿疹様の皮疹を皮膚筋炎の早期皮疹と考えた</p>
          <p>
            <bold>English translation:</bold>
          </p>
          <p>A 58-year-old female.</p>
          <p>The patient visited this hospital due to the appearance of a skin rash which worsened about 2 weeks before her first visit.</p>
          <p>At the initial examination, the patient had extensive edematous erythema on her torso and extremities, with small blisters forming in some areas.</p>
          <p>Keratinized erythema was uniformly observed around the joints on the back side of the fingers, erythema and purpura were observed around the nails, and mild purplish-red spots were observed around the eyes.</p>
          <p>At this point, there were no abnormalities other than mildly elevated CPK and LDH and 20-fold increase in antinuclear antibodies.</p>
          <p>Consequently, follow-up with a topical steroid ointment was carried out.</p>
          <p>However, by the third week, the skin rash on the torso and extremities changed to keratotic red plaques, and edematous erythema of the upper eyelids became prominent by approximately the first month and became a typical rash.</p>
          <p>The patient died 1 year and 2 months after the onset of illness due to complications of lung cancer.</p>
          <p>Based on the clinical history, the erythema multiforme or eczema-like skin rash seen at the time of the initial examination is considered to be an early-stage skin rash of dermatomyositis.</p>
        </boxed-text>
        <p>We randomly selected a subset of 100 documents from the full corpus, referred to from now on as the <italic>data set</italic>. To minimize the difference in difficulty between texts, we selected documents with similar lengths and quantity of GS entities. Texts are, on average, 554 characters long, roughly equivalent to 250-300 English words, containing around 10 entities per text.</p>
        <p>Even though the set of documents for annotation may be considered small, it is worth noting that a scenario with such a small amount of data is not uncommon in the clinical setting, where strong data restrictions usually limit the amount of data available to work with [<xref ref-type="bibr" rid="ref29">29</xref>].</p>
      </sec>
      <sec>
        <title>Annotation Guidelines</title>
        <p>It is common to define a set of guidelines before an annotation process to minimize the divergences between annotators and guarantee consistency.</p>
        <p>We followed the annotation schema as defined by Yada et al [<xref ref-type="bibr" rid="ref30">30</xref>]. To simplify the evaluation process, annotators were asked to label only positive (nonnegated) entities of the “Diseases and symptoms” category. We provided the participants with a document describing what should be annotated and some examples, as summarized in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Annotation guidelines.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="470"/>
            <col width="500"/>
            <thead>
              <tr valign="bottom">
                <td colspan="2">Description</td>
                <td>Examples<sup>a</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="3">
                  <bold>What to annotate</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Reported symptoms, disease names, and clinical findings (pathology, CT<sup>b</sup>, and other images)</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Patient visited this hospital due to the appearance of a <italic>skin rash</italic>.</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Clinical suspicion, even if there is a slight possibility of disease occurrence</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p><italic>Epicarditis</italic> was <italic>suspected</italic> and the patient was hospitalized on July 2.</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>The locus of a condition, such as an anatomical structure or location, body substance, or physiologic function</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Abdominal CT scan revealed <italic>many enlarged intra-abdominal lymph nodes</italic>.</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Adjectives and other modifier words that alter the characteristics or intensity of a condition</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Patient had no subjective symptoms other than a <italic>high fever</italic>.</p>
                    </list-item>
                    <list-item>
                      <p>There was <italic>spotty necrosis</italic> in the lobules.</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>What should not be annotated</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Absence of symptoms or diseases. Basically, a negation of a clinical concept</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Abdominal findings were unremarkable.</p>
                    </list-item>
                    <list-item>
                      <p>The rash disappeared in about 2 months.</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>General discussion of a condition merely as a reference and not as a clinical finding</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>There is a possibility of primary biliary cholangitis when elevated hepatobiliary enzymes are detected.</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Numeric or qualitative findings of an investigation, such as laboratory test values</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>The measured blood pressure was abnormal.</p>
                    </list-item>
                  </list>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>In the examples, entities that should be annotated are marked in italics.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>CT: computed tomography.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Annotation Methodologies</title>
        <p>Our goal is to evaluate whether relieving the emphasis on entity boundary improves annotation speed while maintaining the overall quality of the produced labels. Thus, we compared the <italic>traditional</italic> (boundary-strict) annotation method against 2 proposed boundary-free approaches: <italic>lenient span</italic> and <italic>point annotation</italic>. <xref rid="figure1" ref-type="fig">Figure 1</xref> presents a comparative example of each annotation method.</p>
        <sec>
          <title>Traditional Annotation</title>
          <p>Traditional annotation requires precise annotation of each NE’s exact start and end positions.</p>
        </sec>
        <sec>
          <title>Lenient Span Annotation</title>
          <p>Lenient span annotation introduces flexibility to the annotation boundaries. While the annotation is still composed of a span, start and end positions are not required to be exactly aligned with the NE boundaries.</p>
        </sec>
        <sec>
          <title>Point Annotation</title>
          <p>Unlike span-based paradigms, this method requires selecting a single point at any position within the NE span without explicitly specifying the span. It prioritizes speed and simplicity in scenarios where it is not straightforward to determine the NE span precisely. On the other hand, it may introduce ambiguity in the information captured by the annotation.</p>
        </sec>
      </sec>
      <sec>
        <title>Note on LLM Annotation</title>
        <p>While the use of generative LLMs for text annotation is gaining traction, in this work, we seek ways to aid human annotation and reduce the necessary effort as much as possible where LLMs cannot be used.</p>
        <p>The use of LLMs still raises privacy and security concerns because, owing to the infrastructure and computational power they require, these models are usually hosted in the cloud and owned by third-party companies [<xref ref-type="bibr" rid="ref31">31</xref>]. Given the sensitive nature of clinical data, the usage of LLMs in NLP tasks on real-world data is usually constrained by the policy of medical institutions. Thus, there is still a need for manual annotations until performant medical LLMs can be accessed through a secure private network or hosted inside hospital facilities at a reasonable cost.</p>
      </sec>
      <sec>
        <title>Annotation Task</title>
        <p>We asked 4 annotators with a medical background and different levels of annotation experience to participate in the experiments. They produced 3 annotated corpora by labeling the documents from the data set using each evaluated methodology. We measured the time taken for each annotation session and computed agreement metrics. We then used each produced corpus to fine-tune a Bidirectional Encoder Representations From Transformers (BERT)–based [<xref ref-type="bibr" rid="ref32">32</xref>] NER system and evaluated its performance to assess the quality of the corpora.</p>
      </sec>
      <sec>
        <title>Annotation Tool Development</title>
        <p>We developed a Java-based annotation tool to support the proposed boundary-free approaches [<xref ref-type="bibr" rid="ref33">33</xref>]. Annotations can be presented with smoothed edges using a gradient of color to represent a <italic>soft boundary</italic> and encourage the annotators to be less meticulous when marking the boundaries of the concept. <xref rid="figure2" ref-type="fig">Figure 2</xref> shows a screenshot of the main annotation window.</p>
        <p>The text is displayed in its original style, keeping line breaks, spacing, and special characters. Since there is no pretokenization of the texts, annotators can select text spans with character-level precision.</p>
        <p>The tool has the following two modes to annotate a concept: (1) <italic>click and drag</italic> and (2) <italic>click-only.</italic></p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Screenshot of the annotation tool.</p>
          </caption>
          <graphic xlink:href="medinform_v12i1e59680_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <sec>
          <title>Click and Drag</title>
          <p>The user clicks on the location where the concept begins and drags the mouse up to where it ends. After releasing the mouse, the area becomes highlighted, representing the labeling.</p>
        </sec>
        <sec>
          <title>Click-Only</title>
          <p>The user clicks on an entity to label it. While the annotation is stored as a single point, the position will be expanded to a <italic>simulated</italic> span on the interface, representing approximately the labeled concept, as shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>.</p>
          <p>The annotators received instructions on how to use the tool and a video demonstrating the annotation of a document. They were also supplied with 10 test documents to familiarize themselves with the tool.</p>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Example of a click-only annotation. The selected position, represented by the red circle, is expanded to the word boundaries (in green) plus a random span (orange arrows).</p>
            </caption>
            <graphic xlink:href="medinform_v12i1e59680_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
      </sec>
      <sec>
        <title>Labeling Workflow</title>
        <p>To minimize the number of times each annotator would annotate the same document yet allow us to have at least 2 sets of annotations for a given methodology, we divided our data set of 100 documents into 4 splits.</p>
        <p>For each annotation session, each participant received a file containing 2 splits and the annotation methodology that should be used (totaling 50 documents per annotator), as presented in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Data split for crossover experiment design.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="300"/>
            <col width="160"/>
            <col width="180"/>
            <col width="180"/>
            <col width="180"/>
            <thead>
              <tr valign="top">
                <td>Annotator or annotators</td>
                <td colspan="4">Documents</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>1-25</td>
                <td>26-50</td>
                <td>51-75</td>
                <td>76-100</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>A</td>
                <td>P<sup>a</sup>/T<sup>b</sup></td>
                <td>P</td>
                <td>S<sup>c</sup>/T</td>
                <td>S</td>
              </tr>
              <tr valign="top">
                <td>B</td>
                <td>S/T</td>
                <td>S/T</td>
                <td>P</td>
                <td>P</td>
              </tr>
              <tr valign="top">
                <td>C</td>
                <td>S</td>
                <td>P/T</td>
                <td>P/T</td>
                <td>S/T</td>
              </tr>
              <tr valign="top">
                <td>D</td>
                <td>P</td>
                <td>S</td>
                <td>S</td>
                <td>P/T</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>P: point annotation.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>T: traditional annotation.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>S: lenient span annotation.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>We attempted to maximize the mixing between the annotator and the methodology used.</p>
        <p>The work was executed in 3 different sessions, the first for point annotation, followed by the lenient span annotation, and lastly, the traditional annotation. During the first 2 sessions, the annotation tool was configured to show smooth edges, and annotators were instructed not to fix slightly incorrect annotations as long as the core concept was highlighted in the tool’s interface.</p>
        <p>Although the same annotator worked on the same document more than once, the traditional annotation (third) session was conducted 6 months later to avoid memory bias affecting the annotation time measurement. This time annotators were instructed to be as precise as possible when selecting the entity spans and not to refrain from undoing incorrect annotations. The annotation tool was configured beforehand to present the annotations with precise hard boundaries, as any other standard annotation software.</p>
        <p>Across all sessions, participants were instructed to annotate the broadest expression whenever in doubt about whether some words should be included in the annotation. Each session produced 2 parallel sets of annotations for each document, unified in a single corpus for each annotation method.</p>
        <p>We resolved all disagreements between the 2 sets automatically. We accepted all annotations made by either annotator, even if there is no matching counterpart. Whenever there is boundary disagreement, we choose the broadest span possible when combining the 2 annotations.</p>
        <p>For <italic>point</italic> annotations, we grouped annotations that refer to the same NE and averaged their positions. We consider annotations as referring to the same concept when located within 6 characters of distance from each other. The distance limit was chosen based on the average Japanese word length, around 3 characters. We chose a larger value to account for multiword concepts.</p>
      </sec>
      <sec>
        <title>Point-to-Span Estimation</title>
        <p>Being aware that the single-position label produced by the <italic>point</italic> annotation method may not convey enough information about the adequate range of the NE to be extracted when training the model, we developed a <italic>point-to-span</italic> estimation method [<xref ref-type="bibr" rid="ref34">34</xref>]. It can complement the annotation with span information without additional manual work.</p>
        <p>We used a BERT model (referred to as the <italic>expansion model</italic>) that receives the positional annotation and attempts to predict the original NE span. Effectively, it works as a method to convert Points into Span-based annotations, as illustrated in <xref rid="figure4" ref-type="fig">Figure 4</xref>.</p>
        <p>The <italic>point-to-span</italic> estimation model is based on the pretrained <italic>tohoku-nlp/bert-base-japanese-char-v2</italic> model [<xref ref-type="bibr" rid="ref35">35</xref>], and it was fine-tuned using the training parameters presented in <xref ref-type="table" rid="table3">Table 3</xref>. Training was performed on a server with 2 NVIDIA Quadro RTX 8000 GPUs.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Flow of the point-to-span estimation process. BERT: Bidirectional Encoder Representations From Transformers.</p>
          </caption>
          <graphic xlink:href="medinform_v12i1e59680_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Hyperparameters used for model training.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>Parameter</td>
                <td>Value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Max epochs</td>
                <td>10</td>
              </tr>
              <tr valign="top">
                <td>Training batch size</td>
                <td>16</td>
              </tr>
              <tr valign="top">
                <td>Learning rate</td>
                <td>3×10<sup>–5</sup></td>
              </tr>
              <tr valign="top">
                <td>Optimizer</td>
                <td>AdamW</td>
              </tr>
              <tr valign="top">
                <td>Max sentence length</td>
                <td>512 characters</td>
              </tr>
              <tr valign="top">
                <td>Model selection</td>
                <td>Early stopping</td>
              </tr>
              <tr valign="top">
                <td>Training time</td>
                <td>Approximately 30 min</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>As training data, we used a large data set of Japanese medical texts with labeled diseases and symptoms consisting of 1027 synthetic medication history notes generated through crowdsourcing. In total, 10 experienced dispensing pharmacists were hired as writers to craft the corpus. Each writer was assigned 1 of 285 drug names and tasked with creating a “typical” clinical narrative.</p>
        <p>Before being fed to the model, each annotation of the training data was replaced by an identifier token <inline-graphic xlink:href="medinform_v12i1e59680_fig9.png" xlink:type="simple" mimetype="image"/> in a random location within its span based on a truncated normal distribution. A different distribution was used for each annotation, centered on the middle point, with SD being a sixth of the annotation length. Due to the randomness of the data, we augmented the data set 10 times by re-executing the annotation replacement module and generating different valid positions for the <inline-graphic xlink:href="medinform_v12i1e59680_fig9.png" xlink:type="simple" mimetype="image"/>.</p>
        <p>The expansion model was then trained to identify this token and output the start and end positions of the concept based on the word containing the token and its surrounding context.</p>
        <p>We evaluated the model by predicting the spans for annotations on the GSC. We preprocessed the GSC annotations using the same method to replace the annotations with <inline-graphic xlink:href="medinform_v12i1e59680_fig9.png" xlink:type="simple" mimetype="image"/> tokens. Our best model was able to achieve an <italic>F</italic><sub>1</sub>-score of 0.77.</p>
        <p>We applied the expansion model to the point-annotated data set to infer spans for each annotation, producing a <italic>point-expanded</italic> corpus. Effectively, the combination of point annotation and expansion allows the generation of a span-annotated data set with less human effort.</p>
      </sec>
      <sec>
        <title>Evaluation</title>
        <sec>
          <title>Annotation Method Efficiency</title>
          <p>We evaluated the annotation methods according to the following:</p>
          <list list-type="bullet">
            <list-item>
              <p>Annotation quality: We assessed the percentage of GSC concepts that were correctly annotated. We consider an annotation correct when at least 1 token overlaps with the GS span.</p>
            </list-item>
            <list-item>
              <p>Annotation time: Annotators manually measured the time they took to work on the data during each session. They were instructed to start the timing after loading the texts in the annotation software.</p>
            </list-item>
            <list-item>
              <p>Interannotator agreement (IAA): We use Cohen Kappa [<xref ref-type="bibr" rid="ref36">36</xref>], one of the most common metrics for gauging agreement between annotators. Kappa is a function of the proportion of observed and expected agreement, and it may be interpreted as the proportion of agreement corrected for chance [<xref ref-type="bibr" rid="ref37">37</xref>].</p>
            </list-item>
          </list>
          <p>Given that the <italic>point</italic> annotation methodology allows for multiple correct annotations within the NE span, we computed an additional <italic>adjusted variant</italic> of the metrics specifically for these annotations. In this variant, we considered annotations to agree if they were within a 3-character range of each other, reflecting the average word length in the Japanese language.</p>
        </sec>
        <sec>
          <title>Downstream Task Performance</title>
          <p>As one of the typical downstream tasks, we developed an NER system to benchmark each annotation approach. We again employed the pretrained <italic>tohoku-nlp/bert-base-japanese-char-v2</italic> model [<xref ref-type="bibr" rid="ref35">35</xref>] and fine-tuned it using our annotated corpora.</p>
          <p>We used the same training parameters for all models, as presented in <xref ref-type="table" rid="table3">Table 3</xref>. To minimize the variability between results, we used 5-fold cross-validation and averaged the obtained values.</p>
          <p>We evaluated model predictions on the MedTxt-CR-JA test set, comprised of 75 documents, by the metrics of <italic>precision</italic>, <italic>recall</italic>, and <italic>F-score</italic>. We employ two variants of the metrics: (1) strict and (2) relaxed.</p>
          <p>Strict metrics follow CoNLL criteria [<xref ref-type="bibr" rid="ref38">38</xref>] and only consider predictions where the span exactly matches the ground truth. These metrics allow us to estimate how closely the model fits the GS.</p>
          <p>Relaxed metrics [<xref ref-type="bibr" rid="ref39">39</xref>] accept partial matches or extra tokens as long as at least 1 token of the predicted span overlaps with the GS span. This variant allows assessing the model’s capability of identifying the presence of concepts of interest in the text.</p>
        </sec>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>In this study, an annotation process was conducted with the help of human participants. All annotators were provided with detailed information about the purpose, methods, and potential uses of the data they produced, and their informed consent was obtained.</p>
        <p>To ensure the privacy of all the patients related to the medical data used in this study, we selected a data set already fully anonymized.</p>
        <p>As this research did not use personally identifiable information, it was exempt from institutional review board approval in accordance with the Ethical Guidelines for Medical and Health Research Involving Human Subjects stipulated by the Japanese national government (Chapter 1, Part 3, 1C) [<xref ref-type="bibr" rid="ref40">40</xref>].</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Annotation Method Efficiency</title>
        <p>Upon merging the data received from the annotators, we produced the final version of the annotated corpus for each one of the methodologies. <xref ref-type="table" rid="table4">Table 4</xref> shows some statistics of the produced corpora.</p>
        <p>There is no substantial difference between <italic>traditional</italic> and <italic>lenient span</italic> methods when comparing the average length of the produced annotation. However, both produced annotations slightly larger than the gold annotations due to the disagreement resolution approach adopted in this study.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Statistics of the produced corpora.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="450"/>
            <col width="250"/>
            <col width="300"/>
            <thead>
              <tr valign="top">
                <td>Method</td>
                <td>Total annotations</td>
                <td>Average annotation length (character)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Gold standard</td>
                <td>1167</td>
                <td>6.31</td>
              </tr>
              <tr valign="top">
                <td>Traditional</td>
                <td>1065</td>
                <td>7.30</td>
              </tr>
              <tr valign="top">
                <td>Lenient span</td>
                <td>1012</td>
                <td>7.30</td>
              </tr>
              <tr valign="top">
                <td>Point</td>
                <td>1066</td>
                <td>—<sup>a</sup></td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>Not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <sec>
          <title>Annotation Quality</title>
          <p><xref ref-type="table" rid="table5">Table 5</xref> shows the average of GSC annotations covered by each corpus.</p>
          <p>Although none of the methodologies captured all the ground truth concepts, the percentage of entities captured was similar for every method, with less than a 10% (73 annotations) difference between the best (lenient span) and worst (point).</p>
          <p>As the value of missed entities is consistent for all methodologies, we attribute it to some divergence between the guidelines for annotating the GSC and the one used in this study. Differences in the interpretation may have led the annotators to skip some of the entities.</p>
          <p>We noticed that the traditional methodology presented a more constant accuracy throughout the annotators, while the boundary-relaxed methods had more variation, especially for annotators C and D.</p>
          <p><xref rid="figure5" ref-type="fig">Figure 5</xref> presents the accuracy of the annotations of each participant in relation to GSC on each methodology.</p>
          <table-wrap position="float" id="table5">
            <label>Table 5</label>
            <caption>
              <p>Average number of correctly annotated gold standard (GS) entities per annotation method.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="500"/>
              <col width="500"/>
              <thead>
                <tr valign="top">
                  <td>Method</td>
                  <td>Annotated GS entities, n (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Traditional</td>
                  <td>819 (83.56)</td>
                </tr>
                <tr valign="top">
                  <td>Lenient span</td>
                  <td>796 (83.65)</td>
                </tr>
                <tr valign="top">
                  <td>Point</td>
                  <td>746 (77.41)</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <fig id="figure5" position="float">
            <label>Figure 5</label>
            <caption>
              <p>Annotation accuracy per annotator. GS: gold standard.</p>
            </caption>
            <graphic xlink:href="medinform_v12i1e59680_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Annotation Time</title>
          <p>The time measurement results in <xref ref-type="table" rid="table6">Table 6</xref> demonstrate that both boundary-free annotation techniques can provide time-saving benefits. On average, reductions of around 25% (around 28 min) and 20% (around 21 min) were observed when using <italic>point</italic> and <italic>lenient span</italic> methods, respectively, compared to the <italic>traditional</italic> annotation process.</p>
          <table-wrap position="float" id="table6">
            <label>Table 6</label>
            <caption>
              <p>Comparison of the individual annotation time per annotation method<sup>a</sup>.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="220"/>
              <col width="270"/>
              <col width="290"/>
              <col width="220"/>
              <thead>
                <tr valign="top">
                  <td>Annotator</td>
                  <td>Traditional</td>
                  <td>Lenient span</td>
                  <td>Point</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>A</td>
                  <td>1:23:44</td>
                  <td>1:03:23 (–24%)</td>
                  <td>0:54:35 (–35%)</td>
                </tr>
                <tr valign="top">
                  <td>B</td>
                  <td>1:09:14</td>
                  <td>0:52:07 (–25%)</td>
                  <td>0:48:45 (–30%)</td>
                </tr>
                <tr valign="top">
                  <td>C</td>
                  <td>3:16:58</td>
                  <td>2:10:20 (–34%)</td>
                  <td>2:15:27 (–31%)</td>
                </tr>
                <tr valign="top">
                  <td>D</td>
                  <td>1:10:23</td>
                  <td>1:31:29 (+30%)</td>
                  <td>1:10:40 (+0%)</td>
                </tr>
                <tr valign="top">
                  <td>Average</td>
                  <td>1:45:05</td>
                  <td>1:24:20 (–20%)</td>
                  <td>1:17:22 (–26%)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table6fn1">
                <p><sup>a</sup>Times are presented in the HH:MM:SS format, with the percentage comparison to the traditional method in parentheses.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
        <sec>
          <title>Interannotator Agreement</title>
          <p>As evidenced by the results presented in <xref ref-type="table" rid="table7">Table 7</xref>, the IAA measured for both boundary-free annotation methods surpassed that of the <italic>Traditional</italic> methodology.</p>
          <p><italic>Point</italic> annotations recorded the lowest agreement due to the inherent low probability of annotators precisely pinpointing the exact same position within an NE. Despite that, it achieves the highest measured agreement using the adjusted variant of the metrics.</p>
          <table-wrap position="float" id="table7">
            <label>Table 7</label>
            <caption>
              <p>Average interannotator agreement per annotation methodology.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="500"/>
              <col width="500"/>
              <thead>
                <tr valign="top">
                  <td>Method</td>
                  <td>Cohen Kappa</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Traditional</td>
                  <td>0.731</td>
                </tr>
                <tr valign="top">
                  <td>Lenient span</td>
                  <td>0.774</td>
                </tr>
                <tr valign="top">
                  <td>Point</td>
                  <td>0.326</td>
                </tr>
                <tr valign="top">
                  <td>Point (adjusted)</td>
                  <td>0.811</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
        </sec>
      </sec>
      <sec>
        <title>Downstream Task Performance</title>
        <p><xref ref-type="table" rid="table8">Table 8</xref> presents the NER model evaluation results.</p>
        <p>We trained a GSM using the GS data as a reference for our system’s best possible performance.</p>
        <p>The data produced in our annotation experiments probably have lower quality due to the lack of proper curation and review sessions. Thus, when comparing the <italic>Traditional</italic> annotation approach against the GSM, there is a slight decrease in performance: 15% and 11% on strict and relaxed metrics, respectively. Nevertheless, the relation between precision and recall remains the same, as both models were trained on similarly boundary-strict annotations.</p>
        <table-wrap position="float" id="table8">
          <label>Table 8</label>
          <caption>
            <p>Evaluation of the trained named entity recognition models.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="300"/>
            <col width="130"/>
            <col width="120"/>
            <col width="120"/>
            <col width="130"/>
            <col width="110"/>
            <col width="90"/>
            <thead>
              <tr valign="top">
                <td>Method</td>
                <td colspan="3">Strict</td>
                <td colspan="3">Relaxed</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Precision</td>
                <td>Recall</td>
                <td><italic>F</italic><sub>1</sub>-score</td>
                <td>Precision</td>
                <td>Recall</td>
                <td><italic>F</italic><sub>1</sub>-score</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Gold standard model</td>
                <td>0.72</td>
                <td>0.78</td>
                <td>0.75</td>
                <td>0.90</td>
                <td>0.89</td>
                <td>0.89</td>
              </tr>
              <tr valign="top">
                <td>Traditional</td>
                <td>0.60</td>
                <td>0.69</td>
                <td>0.64</td>
                <td>0.77</td>
                <td>0.81</td>
                <td>0.79</td>
              </tr>
              <tr valign="top">
                <td>Lenient span</td>
                <td>0.56</td>
                <td>0.54</td>
                <td>0.55</td>
                <td>0.67</td>
                <td>0.62</td>
                <td>0.64</td>
              </tr>
              <tr valign="top">
                <td>Point</td>
                <td>0.00</td>
                <td>0.00</td>
                <td>0.00</td>
                <td>0.60</td>
                <td>0.45</td>
                <td>0.51</td>
              </tr>
              <tr valign="top">
                <td>Point (expanded)</td>
                <td>0.34</td>
                <td>0.35</td>
                <td>0.35</td>
                <td>0.73</td>
                <td>0.71</td>
                <td>0.72</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Throughout the experiments, it was noticeable that simplifying the annotation process contributed to a more comfortable experience for the participants. We observed increased annotation speed, annotator agreement, and overall positive feedback from the annotators regarding the changes.</p>
        <p>Although we showcase our proposal in clinical data, the annotation methodologies are both domain and language-agnostic, so they can be applied to texts of different domains and languages.</p>
        <sec>
          <title>Annotation Speed Improvements</title>
          <p>The results in <xref ref-type="table" rid="table6">Table 6</xref> show that simplifying the constraints under which annotators work can effectively increase the speed at which they execute the task. By virtually removing the need to decide on entity boundaries, both proposed methodologies allowed the annotation of our data set in less time than the <italic>traditional</italic> method.</p>
          <p>However, while an overall decreasing trend in annotation time was observed, different annotators experienced varying degrees of time reduction. Notably, annotator C experienced a significant increase in efficiency when using these methodologies. Conversely, annotator D was quicker with the <italic>traditional</italic> annotation scheme. Still, his precision was lower than that of the other annotators, as shown by the individual accuracy results presented in <xref rid="figure5" ref-type="fig">Figure 5</xref>.</p>
        </sec>
        <sec>
          <title>Annotator Agreement Improvements</title>
          <p>Meanwhile, the IAA evaluation (<xref ref-type="table" rid="table7">Table 7</xref>) revealed some interesting insights into the annotation consistency of each methodology. Both the <italic>lenient span</italic> and the adjusted <italic>point</italic> agreement surpassed that of the <italic>traditional</italic> methodology, by 5.88% and 10.94%, respectively.</p>
          <p>While we believe that slightly different interpretations of what information should be annotated may have diminished <italic>traditional</italic> approach agreement, such a finding was still unexpected due to the higher flexibility given to the annotators when removing the need for entity boundaries. However, this improvement can be attributed to the ease with which annotators can consistently agree on the core parts of mentions (or the “main words”) compared to determining the precise boundaries of entire entities. Such boundaries may or may not encompass adjectives, modifiers, etc, which often contribute to annotation disagreements.</p>
          <p>Notably, <italic>point</italic> annotations showed a large difference in the agreement values measured using the default and <italic>adjusted</italic> variants of the IAA metrics. This is explained by the fact that, even though it is virtually impossible for annotators to select the same character in an NE for all annotations, they generally selected positions close to each other for the same NE. Such a finding is evidenced by the distribution of annotation pairs based on the number of characters of difference between them, as depicted in <xref ref-type="table" rid="table9">Table 9</xref>.</p>
          <p>Such a small distance is due to annotators’ diligence in positioning the annotation close to the center of the NE’s core word. As in the sentence shown in <xref rid="figure6" ref-type="fig">Figure 6</xref> (which translates to “Current symptoms: Diffuse dark red infiltration is observed on both cheeks.”), even though the span of the desired annotation is quite large, both annotators placed their labels near the most relevant set of words, “dark red infiltration.”</p>
          <table-wrap position="float" id="table9">
            <label>Table 9</label>
            <caption>
              <p>Distribution of annotation pairs based on the distance between them.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="500"/>
              <col width="500"/>
              <thead>
                <tr valign="top">
                  <td>Number of characters of difference</td>
                  <td>Annotations, n (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>0</td>
                  <td>661 (41.31)</td>
                </tr>
                <tr valign="top">
                  <td>1</td>
                  <td>640 (40.00)</td>
                </tr>
                <tr valign="top">
                  <td>2</td>
                  <td>145 (9.06)</td>
                </tr>
                <tr valign="top">
                  <td>3</td>
                  <td>52 (3.25)</td>
                </tr>
                <tr valign="top">
                  <td>4</td>
                  <td>25 (1.56)</td>
                </tr>
                <tr valign="top">
                  <td>5</td>
                  <td>15 (0.94)</td>
                </tr>
                <tr valign="top">
                  <td>6</td>
                  <td>15 (0.94)</td>
                </tr>
                <tr valign="top">
                  <td>7</td>
                  <td>14 (0.88)</td>
                </tr>
                <tr valign="top">
                  <td>8</td>
                  <td>11 (0.69)</td>
                </tr>
                <tr valign="top">
                  <td>9</td>
                  <td>15 (0.94)</td>
                </tr>
                <tr valign="top">
                  <td>≥10</td>
                  <td>7 (0.44)</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <fig id="figure6" position="float">
            <label>Figure 6</label>
            <caption>
              <p>Example of 2 distinct point annotations in a named entity with a large span (underscored). The annotations are located near the center of the core word (in bold).</p>
            </caption>
            <graphic xlink:href="medinform_v12i1e59680_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Annotator’s Opinions</title>
          <p>Annotator feedback was positive especially regarding the <italic>point</italic> annotation, given its simplicity. The participants highlighted the easiness of the single-click selection mode, particularly due to the reduced mouse manipulation needed.</p>
          <p>However, the participants expressed difficulty in understanding the correctness of their annotations and whether the chosen range was indeed accurate. They felt that the soft boundaries displayed by the annotation tool rendered the annotations ambiguous, making them unsure whether they matched the range they intended to select.</p>
        </sec>
        <sec>
          <title>Impacts on Model Performance</title>
          <p>While achieving significant improvements in annotators’ work quality, the additional flexibility from boundary-free methods considerably impacted model performance, particularly in strict evaluation, due to the imprecise training data, as seen in <xref ref-type="table" rid="table8">Table 8</xref>.</p>
          <p>The <italic>lenient span</italic>–trained model exhibited a significant drop in its recall, which hindered strict and relaxed evaluations. We did not expect that the ambiguity in NE boundaries could affect the model’s capability of locating NEs in the text.</p>
          <p>While such a performance drop may be acceptable for some applications, we believe additional annotation postprocessing methods could restore the accuracy to levels similar to the <italic>traditional</italic> schema.</p>
        </sec>
        <sec>
          <title>Point-to-Span Estimation</title>
          <p>In particular, the insights from <italic>point</italic> annotation experiments underscore the potential of automated methods to supplement human annotations. We believe that <italic>point-to-span estimation</italic> can be pivotal for improving annotation speed, but beyond that, it can also prove beneficial in addressing other annotation problems.</p>
          <p>Given the lackluster nature of the annotation task, it is not uncommon that annotators make mistakes, such as including punctuation markers or failing to label part of the NE simply for a lack of focus. The span estimation model can be a tool to “normalize” such annotations.</p>
          <p>Furthermore, the estimation could be integrated into the actual annotation process by coupling it with our annotation tool, enabling the “click-only” annotation interface to present the predicted span directly and allowing the annotator to correct its mistakes.</p>
          <p>However, there is potential for enhancements in the expansion model. Although expanding a point to the expected word seems to be a simple task, we are evaluating our methods on a scriptio continua language, which makes the definition of the word boundaries not as obvious as in space-delimited languages, such as English.</p>
          <p>Through analysis of the model’s output, we have observed that the estimation model exhibited a tendency to choose spans larger than the GS entities, particularly when characters that act like qualitative adjectives (such as “高” for high, “急性” for acute, “巨大” for huge) were connected to the concept of interest.</p>
          <p>For instance, the model outputted “高度の肝萎縮” (Severe liver atrophy) instead of only “肝萎縮” (Liver atrophy). Another example was the expansion of the term “巨大な脾腎シャント” (Giant splenorenal shunt), where “巨大な” (Giant) was included.</p>
          <p>Yet, even though the model output in these examples can be regarded as “incorrect” when compared to the GSC, from a clinical point of view, it is not uncommon that some diseases are distinguished by such modifier words. For example, “急性胆嚢炎” (acute cholecystitis) and “慢性胆嚢炎” (chronic cholecystitis) even have different International Classification of Diseases codes, K81.0 and K81.1, respectively.</p>
        </sec>
      </sec>
      <sec>
        <title>Error Analysis</title>
        <p><xref rid="figure7" ref-type="fig">Figures 7</xref> and <xref rid="figure8" ref-type="fig">8</xref> present example comparisons between all the evaluated models in 2 different sentences.</p>
        <p>We could not identify any unusual behavior when inspecting the traditional annotation model output. Yet, we highlight that the lenient span model portrayed a tendency to overly extend the span lengths. In some cases (as shown especially in <xref rid="figure8" ref-type="fig">Figure 8</xref>), multiple NEs are “merged” into a single continuous extraction.</p>
        <p>As seen in both examples, the model trained with raw point annotations could not extract NE spans, denoting that the single position annotation contains insufficient information to train the model properly.</p>
        <p>In contrast, the model trained on expanded point annotations showcases the effectiveness of the <italic>point-to-span</italic> estimation method. Although strict metrics are still substantially lower than other approaches, relaxed results are comparable to the <italic>traditional</italic> annotation approach. The analysis of the model output evidenced that, while it could locate most concepts of interest, it struggled in correctly extracting multiword concepts.</p>
        <fig id="figure7" position="float">
          <label>Figure 7</label>
          <caption>
            <p>Comparison of model output for the sentence “While waiting for a CT scan, patient went into cardiopulmonary arrest (CA), but could not be resuscitated and died.” Gold standard entities and model extractions are marked in bold and underscored. White space tokenization was added to the Japanese text to enhance readability for non-Japanese readers. The original text does not contain spaces.</p>
          </caption>
          <graphic xlink:href="medinform_v12i1e59680_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure8" position="float">
          <label>Figure 8</label>
          <caption>
            <p>Comparison of model output for the sentence “History of hypertension (HTN), diabetes, hyperlipidemia (HLD), or atrial fibrillation (AFib).” Gold standard entities and model extractions are marked in bold and underscored. White space tokenization was added to the Japanese text to enhance readability for non-Japanese readers. The original text does not contain spaces.</p>
          </caption>
          <graphic xlink:href="medinform_v12i1e59680_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>While our research focused on exploring novel approaches to text annotation and revealed promising findings, a few concerns and limitations need further investigation. Our investigations were only conducted in the Japanese language. Though our proposal is language independent, applying our techniques in a space-delimited language, such as English, could introduce some bias. Evaluation using different languages is, thus, encouraged. Since our data set in this study has an English variant, we plan to conduct additional experiments.</p>
        <p>We concentrated on a single entity class, disease and symptom names, to streamline the analysis. Even though our texts contain a large number of entities, a single class annotation may not represent a real use case. Exploring our methodologies in a multiclass scenario would enhance the robustness of our findings and conclusions.</p>
        <p>Furthermore, we acknowledge that automated labeling techniques, such as preannotation, can affect the improvements observed in annotation time by adopting boundary-free methodologies. We chose not to incorporate these features in our annotation tool to minimize the number of variables affecting the annotation process.</p>
        <p>The observed performance of the trained NER models could have been impacted by our choice of using a simple and automatic approach to solve disagreements. Although it avoids additional annotator work and simplifies the research flow, implementing adjudication or review sessions with the annotations would be preferred, as it could have provided a better annotation quality.</p>
        <p>LLMs are prevalent in the current NLP research scenario, and their application has led to the development of systems that push state-of-the-art performance in many different tasks. In the current state of our work, we have not adopted LLMs. Still, we acknowledge that the accuracy of our methods may be improved by employing such methods in our workflow, possibly replacing the Point-to-span BERT model.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this study, we investigated the effects of reducing the emphasis on entity boundary annotations while labeling NEs in a medical data set. We proposed 2 novel boundary-free annotation methodologies, <italic>lenient span</italic> and <italic>point</italic> annotation. We evaluated the impact of their application in an annotation process regarding annotation efficiency and the quality of the labeling produced.</p>
        <p>We also publicly released our developed annotation tool [<xref ref-type="bibr" rid="ref33">33</xref>] and point-to-span estimation model [<xref ref-type="bibr" rid="ref34">34</xref>].</p>
        <p>Our results demonstrate a trade-off relation between annotation efficiency and model performance. Although not surprising, it unveils the weak points of each methodology and uncovers potential adjustments that can be made to each approach. We underscore that completely disregarding boundary information may ease the annotator’s work while it sacrifices performance to some extent.</p>
        <p>We plan to evaluate the proposed methodologies in other languages in future work. We also intend to explore the impact of postprocessing techniques, such as normalization or boundary regularization, to enhance model output performance.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AL</term>
          <def>
            <p>active learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BERT</term>
          <def>
            <p>Bidirectional Encoder Representations From Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">GS</term>
          <def>
            <p>gold standard</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">GSC</term>
          <def>
            <p>gold standard corpus</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">GSM</term>
          <def>
            <p>gold standard model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">IAA</term>
          <def>
            <p>interannotator agreement</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">NE</term>
          <def>
            <p>named entity</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">NER</term>
          <def>
            <p>named entity recognition</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by Japan Science and Technology Agency (JST) Core Research for Evolutionary Science and Technology (CREST) grant JPMJCR22N1, Japan Society for the Promotion of Science (JSPS) Grants-in-Aid for Scientific Research (KAKENHI) grant JP19H01118, and Cross-Ministerial Strategic Innovation Promotion Program (SIP) on “Integrated Health Care System” grant JPJ012425, Japan.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data sets generated during and/or analyzed during this study are available in the MedTxt-CR repository [<xref ref-type="bibr" rid="ref25">25</xref>].</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>GHBA designed the study, performed the computational experiments and data analysis, and wrote the manuscript. SY and EA discussed the results and reviewed the manuscript. EA supervised the study. All the authors have approved the final manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tayefi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ngo</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Chomutare</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Dalianis</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Salvi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Budrionis</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Godtliebsen</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Challenges and opportunities beyond structured data in analysis of electronic health records</article-title>
          <source>WIREs Computational Stats</source>
          <year>2021</year>
          <month>02</month>
          <day>14</day>
          <volume>13</volume>
          <issue>6</issue>
          <fpage>e1549</fpage>
          <pub-id pub-id-type="doi">10.1002/wics.1549</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gomes</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Correia</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ribeiro</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Freitas</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Effort estimation in named entity tagging tasks</article-title>
          <source>Proceedings of the 12th Conference on Language Resources and Evaluation</source>
          <year>2020</year>
          <month>05</month>
          <conf-name>LREC 2020</conf-name>
          <conf-date>May 11-16, 2020</conf-date>
          <conf-loc>Marseille, France</conf-loc>
          <fpage>298</fpage>
          <lpage>306</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2020.lrec-1.37"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Monajatipoor</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Stremmel</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Emami</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mohaghegh</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Rouhsedaghat</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>LLMs in biomedicine: a study on clinical named entity recognition</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on April 10, 2024</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2404.07376</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Marrero</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Urbano</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sánchez-Cuadrado</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Morato</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gómez-Berbís</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Named entity recognition: fallacies, challenges and opportunities</article-title>
          <source>Computer Standards Interfaces</source>
          <year>2013</year>
          <month>09</month>
          <volume>35</volume>
          <issue>5</issue>
          <fpage>482</fpage>
          <lpage>489</lpage>
          <pub-id pub-id-type="doi">10.1016/j.csi.2012.09.004</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>WW</given-names>
            </name>
            <name name-style="western">
              <surname>Nadkarni</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Hirschman</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>D'Avolio</surname>
              <given-names>LW</given-names>
            </name>
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>GK</given-names>
            </name>
            <name name-style="western">
              <surname>Uzuner</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Overcoming barriers to NLP for clinical text: the role of shared tasks and the need for additional creative solutions</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2011</year>
          <volume>18</volume>
          <issue>5</issue>
          <fpage>540</fpage>
          <lpage>543</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/21846785"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2011-000465</pub-id>
          <pub-id pub-id-type="medline">21846785</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2011-000465</pub-id>
          <pub-id pub-id-type="pmcid">PMC3168329</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Baledent</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mathet</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Widlöcher</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Couronne</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Manguin</surname>
              <given-names>JL</given-names>
            </name>
          </person-group>
          <article-title>Validity, agreement, consensuality and annotated data quality</article-title>
          <source>Proceedings of the 13th Conference on Language Resources and Evaluation</source>
          <year>2022</year>
          <conf-name>LREC 2022</conf-name>
          <conf-date>June 20-25, 2022</conf-date>
          <conf-loc>Marseille, France</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2022.lrec-1.315"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Boundary smoothing for named entity recognition</article-title>
          <year>2022</year>
          <conf-name>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name>
          <conf-date>May 22-27, 2022</conf-date>
          <conf-loc>Dublin, Ireland</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Andrade</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Yada</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Aramaki</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Comparative evaluation of boundary-relaxed annotation for entity linking performance</article-title>
          <year>2023</year>
          <conf-name>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name>
          <conf-date>July 9-14, 2023</conf-date>
          <conf-loc>Toronto, ON</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ganchev</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Pereira</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Mandel</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Carroll</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>White</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Semi-automated named entity annotation</article-title>
          <year>2007</year>
          <conf-name>Proceedings of the Linguistic Annotation Workshop</conf-name>
          <conf-date>June 2007</conf-date>
          <conf-loc>Prague, Czech Republic</conf-loc>
          <fpage>53</fpage>
          <lpage>56</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/W07-1509"/>
          </comment>
          <pub-id pub-id-type="doi">10.3115/1642059.1642068</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Komiya</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Suzuki</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Iwakura</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sasaki</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shinnou</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Comparison of methods to annotate named entity corpora</article-title>
          <source>ACM Trans Asian Low-Resour Lang Inf Process</source>
          <year>2018</year>
          <month>07</month>
          <day>21</day>
          <volume>17</volume>
          <issue>4</issue>
          <fpage>1</fpage>
          <lpage>16</lpage>
          <pub-id pub-id-type="doi">10.1145/3218820</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dasgupta</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Kalai</surname>
              <given-names>AT</given-names>
            </name>
          </person-group>
          <article-title>Analysis of perceptron-based active learning</article-title>
          <source>Learning Theory</source>
          <year>2005</year>
          <publisher-loc>Berlin, Heidelberg</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>249</fpage>
          <lpage>263</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kholghi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sitbon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zuccon</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Active learning reduces annotation time for clinical concept extraction</article-title>
          <source>Int J Med Inform</source>
          <year>2017</year>
          <volume>106</volume>
          <fpage>25</fpage>
          <lpage>31</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2017.08.001</pub-id>
          <pub-id pub-id-type="medline">28870380</pub-id>
          <pub-id pub-id-type="pii">S1386-5056(17)30200-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tokunaga</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Nishikawa</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Iwakura</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>An eye-tracking study of named entity annotation</article-title>
          <year>2017</year>
          <conf-name>Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017</conf-name>
          <conf-date>September 4-6, 2017</conf-date>
          <conf-loc>Varna, Bulgaria</conf-loc>
          <fpage>758</fpage>
          <lpage>764</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Saxena</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Sunkle</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kulkarni</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Hybrid search based enhanced named entity annotation tool</article-title>
          <source>Proceedings of the 15th Innovations in Software Engineering Conference</source>
          <year>2022</year>
          <conf-name>ISEC '22</conf-name>
          <conf-date>February 24-26, 2022</conf-date>
          <conf-loc>Gandhinagar, India</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Mitra</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>CR</given-names>
            </name>
            <name name-style="western">
              <surname>Rahman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>MEGAnno+: a human-LLM collaborative annotation system</article-title>
          <year>2024</year>
          <conf-name>Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations</conf-name>
          <conf-date>March 17-22, 2024</conf-date>
          <conf-loc>St. Julians, Malta</conf-loc>
          <fpage>168</fpage>
          <lpage>176</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2024.eacl-demo.18"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goel</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gueta</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gilon</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Erell</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hao</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Jaber</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Reddy</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kartha</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Steiner</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Laish</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Feder</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>LLMs accelerate annotation for medical information extraction</article-title>
          <year>2023</year>
          <conf-name>Proceedings of the 3rd Machine Learning for Health Symposium</conf-name>
          <conf-date>December 10, 2023</conf-date>
          <conf-loc>New Orleans, LA</conf-loc>
          <fpage>82</fpage>
          <lpage>100</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kholodna</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Julka</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Khodadadi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gumus</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Granitzer</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>LLMs in the loop: leveraging large language model annotations for active learning in low-resource languages</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on April 2, 2024</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2404.02261</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Beigi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bhattacharjee</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Karami</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Large language models for data annotation: a survey</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on February 21, 2024</comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sabou</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bontcheva</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Derczynski</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Scharl</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Corpus annotation through crowdsourcing: towards best practice guidelines</article-title>
          <source>Proceedings of the Ninth International Conference on Language Resources and Evaluation</source>
          <year>2014</year>
          <conf-name>LREC'14</conf-name>
          <conf-date>May 26-31, 2014</conf-date>
          <conf-loc>Reykjavik, Iceland</conf-loc>
          <fpage>859</fpage>
          <lpage>866</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.lrec-conf.org/proceedings/lrec2014/pdf/497_Paper.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Snow</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>O'Connor</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Jurafsky</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Cheap and fast - but is it good? Evaluating non-expert annotations for natural language tasks</article-title>
          <year>2008</year>
          <conf-name>Proceedings of the 2008 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>October 2008</conf-date>
          <conf-loc>Honolulu, HI</conf-loc>
          <fpage>254</fpage>
          <lpage>263</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/D08-1027"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A comparative study on annotation quality of crowdsourcing and LLM via label aggregation</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on January 18, 2024</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2401.09760</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pangakis</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Wolken</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Fasching</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Automated annotation with generative AI requires validation</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on May 31, 2023</comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Noisy-labeled NER with confidence estimation</article-title>
          <year>2021</year>
          <conf-name>Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name>
          <conf-date>June 6-11, 2021</conf-date>
          <conf-loc>Online</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Zhuang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>DiffusionNER: boundary diffusion for named entity recognition</article-title>
          <year>2023</year>
          <conf-name>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name>
          <conf-date>July 9-14, 2023</conf-date>
          <conf-loc>Toronto, Canada</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yada</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Nakamura</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wakamiya</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Aramaki</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Real-MedNLP: Overview of real document-based medical natural language processing task</article-title>
          <source>Proceedings of the 16th NTCIR Conference on Evaluation of Information Access Technologies</source>
          <year>2022</year>
          <conf-name>NTCIR 16 Conference</conf-name>
          <conf-date>June 14-17, 2022</conf-date>
          <conf-loc>Tokyo, Japan</conf-loc>
          <fpage>285</fpage>
          <lpage>296</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://research.nii.ac.jp/ntcir/workshop/OnlineProceedings16/pdf/ntcir/01-NTCIR16-OV-MEDNLP-YadaS.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>AEW</given-names>
            </name>
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lehman</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghassemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Moody</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>RG</given-names>
            </name>
          </person-group>
          <article-title>MIMIC-III, a freely accessible critical care database</article-title>
          <source>Sci Data</source>
          <year>2016</year>
          <month>05</month>
          <day>24</day>
          <volume>3</volume>
          <fpage>160035</fpage>
          <pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id>
          <pub-id pub-id-type="medline">27219127</pub-id>
          <pub-id pub-id-type="pii">sdata201635</pub-id>
          <pub-id pub-id-type="pmcid">PMC4878278</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mahajan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Tsou</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Toward understanding clinical context of medication change events in clinical narratives</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2021</year>
          <volume>2021</volume>
          <fpage>833</fpage>
          <lpage>842</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35308981"/>
          </comment>
          <pub-id pub-id-type="medline">35308981</pub-id>
          <pub-id pub-id-type="pii">3577060</pub-id>
          <pub-id pub-id-type="pmcid">PMC8861744</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nishiyama</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Nishidani</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ando</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Yada</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wakamiya</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Aramaki</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>NAISTSOC at the NTCIR-16 Real-MedNLP task</article-title>
          <source>Proceedings of the 16th NTCIR Conference on Evaluation of Information Access Technologies</source>
          <year>2022</year>
          <conf-name>NTCIR 16 Conference</conf-name>
          <conf-date>June 14-17, 2022</conf-date>
          <conf-loc>Tokyo, Japan</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://research.nii.ac.jp/ntcir/workshop/OnlineProceedings16/pdf/ntcir/07-NTCIR16-MEDNLP-NishiyamaT.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Spasic</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Nenadic</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Clinical text data in machine learning: systematic review</article-title>
          <source>JMIR Med Inform</source>
          <year>2020</year>
          <volume>8</volume>
          <issue>3</issue>
          <fpage>e17984</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2020/3/e17984/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/17984</pub-id>
          <pub-id pub-id-type="medline">32229465</pub-id>
          <pub-id pub-id-type="pii">v8i3e17984</pub-id>
          <pub-id pub-id-type="pmcid">PMC7157505</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yada</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Joh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tanaka</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Aramaki</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Kurohashi</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Towards a versatile medical-annotation guideline feasible without heavy medical knowledge: starting from critical lung diseases</article-title>
          <source>Proceedings of the 12th Conference on Language Resources and Evaluation</source>
          <year>2020</year>
          <conf-name>LREC 2020</conf-name>
          <conf-date>May 11-16, 2020</conf-date>
          <conf-loc>Marseille, France</conf-loc>
          <fpage>4565</fpage>
          <lpage>4572</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2020.lrec-1.561"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ollion</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Macanovic</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chatelain</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT for text annotation? Mind the hype!</article-title>
          <source>SocArXiv</source>
          <comment>Preprint posted online on October 4, 2023</comment>
          <pub-id pub-id-type="doi">10.31235/osf.io/x58kn</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on October 1, 2018</comment>
          <pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="web">
          <article-title>Fuzzy annotation tool</article-title>
          <source>GitHub</source>
          <access-date>2024-04-16</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/gabrielandrade2/FuzzyAnnotationTool">https://github.com/gabrielandrade2/FuzzyAnnotationTool</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="web">
          <article-title>Point-to-span estimation BERT model</article-title>
          <source>GitHub</source>
          <year>2023</year>
          <access-date>2024-04-16</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/gabrielandrade2/Point-to-Span-estimation">https://github.com/gabrielandrade2/Point-to-Span-estimation</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>Tohoku NLP Group</collab>
          </person-group>
          <article-title>BERT base Japanese (character-level tokenization with whole word masking, jawiki-20200831)</article-title>
          <source>Hugging Face</source>
          <access-date>2024-04-16</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/tohoku-nlp/bert-base-japanese-char-v2">https://huggingface.co/tohoku-nlp/bert-base-japanese-char-v2</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A coefficient of agreement for nominal scales</article-title>
          <source>Educ Psychological Meas</source>
          <year>1960</year>
          <month>04</month>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>37</fpage>
          <lpage>46</lpage>
          <pub-id pub-id-type="doi">10.1177/001316446002000104</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Warrens</surname>
              <given-names>MJ</given-names>
            </name>
          </person-group>
          <article-title>Five ways to look at Cohen's Kappa</article-title>
          <source>J Psychol Psychother</source>
          <year>2015</year>
          <volume>05</volume>
          <issue>04</issue>
          <fpage>1</fpage>
          <lpage>4</lpage>
          <pub-id pub-id-type="doi">10.4172/2161-0487.1000197</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tjong Kim Sang</surname>
              <given-names>EF</given-names>
            </name>
            <name name-style="western">
              <surname>De Meulder</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Introduction to the CoNLL-2003 shared task: language-independent named entity recognition</article-title>
          <source>Proceedings of the Seventh Conference on Natural Language Learning at HLT-NAACL 2003</source>
          <year>2003</year>
          <conf-name>CoNLL-2003</conf-name>
          <conf-date>May 31-June 1, 2003</conf-date>
          <conf-loc>Edmonton, AB</conf-loc>
          <fpage>142</fpage>
          <lpage>147</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/W03-0419"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ghiasvand</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Kate</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Learning for clinical named entity recognition without manual annotations</article-title>
          <source>Inform Med Unlocked</source>
          <year>2018</year>
          <volume>13</volume>
          <fpage>122</fpage>
          <lpage>127</lpage>
          <pub-id pub-id-type="doi">10.1016/j.imu.2018.10.011</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="web">
          <article-title>Ethical Guidelines for Medical and Health Research Involving Human Subjects</article-title>
          <source>Ministry of Health, Labor and Welfare</source>
          <access-date>2024-04-16</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mhlw.go.jp/file/06-Seisakujouhou-10600000-Daijinkanboukouseikagakuka/0000080278.pdf">https://www.mhlw.go.jp/file/06-Seisakujouhou-10600000-Daijinkanboukouseikagakuka/0000080278.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
