<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i8e18089</article-id>
      <article-id pub-id-type="pmid">32749222</article-id>
      <article-id pub-id-type="doi">10.2196/18089</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Assessment of the Robustness of Convolutional Neural Networks in Labeling Noise by Using Chest X-Ray Images From Multiple Centers</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Lim</surname>
            <given-names>Gilbert</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Lin</surname>
            <given-names>Chin</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Jang</surname>
            <given-names>Ryoungwoo</given-names>
          </name>
          <degrees>BA, MD, MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1511-7469</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Kim</surname>
            <given-names>Namkug</given-names>
          </name>
          <degrees>BA, MA, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <address>
            <institution>Department of Convergence Medicine</institution>
            <institution>Asan Medical Center</institution>
            <institution>University of Ulsan College of Medicine</institution>
            <addr-line>88 Olympic-Ro 43-Gil, Songpa-Gu, Seoul, Korea</addr-line>
            <addr-line>Seoul</addr-line>
            <country>Republic of Korea</country>
            <phone>82 10 3017 4282</phone>
            <email>namkugkim@gmail.com</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3438-2217</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Jang</surname>
            <given-names>Miso</given-names>
          </name>
          <degrees>BA, MA, MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4409-411X</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Lee</surname>
            <given-names>Kyung Hwa</given-names>
          </name>
          <degrees>BA, MA, MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3826-8451</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Lee</surname>
            <given-names>Sang Min</given-names>
          </name>
          <degrees>BA, MA, MD, PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7627-2000</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Lee</surname>
            <given-names>Kyung Hee</given-names>
          </name>
          <degrees>BA, MA, MD, PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2248-2525</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Noh</surname>
            <given-names>Han Na</given-names>
          </name>
          <degrees>BA, MA, MD, PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6887-7878</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Seo</surname>
            <given-names>Joon Beom</given-names>
          </name>
          <degrees>BA, MA, MD, PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0271-7884</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Biomedical Engineering</institution>
        <institution>College of Medicine</institution>
        <institution>University of Ulsan</institution>
        <addr-line>Seoul</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Convergence Medicine</institution>
        <institution>Asan Medical Center</institution>
        <institution>University of Ulsan College of Medicine</institution>
        <addr-line>Seoul</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Radiology</institution>
        <institution>Asan Medical Center</institution>
        <institution>University of Ulsan College of Medicine</institution>
        <addr-line>Seoul</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Radiology</institution>
        <institution>Seoul National University Bundang Hospital</institution>
        <institution>Seoul National University College of Medicine</institution>
        <addr-line>Seongnam</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Health Screening and Promotion Center</institution>
        <institution>Asan Medical Center</institution>
        <addr-line>Seoul</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Namkug Kim <email>namkugkim@gmail.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>8</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>4</day>
        <month>8</month>
        <year>2020</year>
      </pub-date>
      <volume>8</volume>
      <issue>8</issue>
      <elocation-id>e18089</elocation-id>
      <history>
        <date date-type="received">
          <day>2</day>
          <month>2</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>6</day>
          <month>5</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>8</day>
          <month>6</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>21</day>
          <month>6</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Ryoungwoo Jang, Namkug Kim, Miso Jang, Kyung Hwa Lee, Sang Min Lee, Kyung Hee Lee, Han Na Noh, Joon Beom Seo. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 04.08.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2020/8/e18089" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Computer-aided diagnosis on chest x-ray images using deep learning is a widely studied modality in medicine. Many studies are based on public datasets, such as the National Institutes of Health (NIH) dataset and the Stanford CheXpert dataset. However, these datasets are preprocessed by classical natural language processing, which may cause a certain extent of label errors.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to investigate the robustness of deep convolutional neural networks (CNNs) for binary classification of posteroanterior chest x-ray through random incorrect labeling.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We trained and validated the CNN architecture with different noise levels of labels in 3 datasets, namely, Asan Medical Center-Seoul National University Bundang Hospital (AMC-SNUBH), NIH, and CheXpert, and tested the models with each test set. Diseases of each chest x-ray in our dataset were confirmed by a thoracic radiologist using computed tomography (CT). Receiver operating characteristic (ROC) and area under the curve (AUC) were evaluated in each test. Randomly chosen chest x-rays of public datasets were evaluated by 3 physicians and 1 thoracic radiologist.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>In comparison with the public datasets of NIH and CheXpert, in which the AUCs did not drop significantly at label noise levels of up to 16%, the AUC of the AMC-SNUBH dataset decreased significantly from 2% label noise onward. Evaluation of the public datasets by 3 physicians and 1 thoracic radiologist showed an accuracy of 65%-80%.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The deep learning–based computer-aided diagnosis model is sensitive to label noise, and computer-aided diagnosis with inaccurate labels is not credible. Furthermore, open datasets such as NIH and CheXpert need to be distilled before being used for deep learning–based computer-aided diagnosis.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>deep learning</kwd>
        <kwd>convolutional neural network</kwd>
        <kwd>NIH dataset</kwd>
        <kwd>CheXpert dataset</kwd>
        <kwd>robustness</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Posteroanterior chest x-ray (CXR) is one of the most widely used methods to evaluate a subject’s chest. CXR is low cost and easy to assess and acquire, and it provides a variety of information. Researchers developed computer-aided diagnosis (CAD) algorithms for CXRs because of the substantial presence of CXRs in large hospitals and medical centers [<xref ref-type="bibr" rid="ref1">1</xref>]. At present, there are no widely used clinically meaningful CAD algorithms with classical image processing algorithms. However, the success of deep learning has led to the development of deep learning–based CXR CAD algorithms [<xref ref-type="bibr" rid="ref2">2</xref>]. Among the various types of deep learning algorithms, the convolutional neural network (CNN) is the most widely used technique for CXR classification.</p>
      <p>Before applying CNN to CAD development, we need to consider the robustness of CNN for inaccurate datasets. It is believed that CNN is robust to label noise [<xref ref-type="bibr" rid="ref3">3</xref>]. Conversely, clean labels and accurate datasets are considered necessary conditions for CNN-based classification. However, the differences in complexity between datasets from Modified National Institute of Standards and Technology (MNIST) and CXRs were enormous. The MNIST images had a size of 28×28 pixels, whereas the image sizes in CXR datasets were mostly above 1024×1024 pixels. Therefore, relying on the robustness of deep learning alone for CXR datasets would be insufficient. Some [<xref ref-type="bibr" rid="ref3">3</xref>] asserted that accuracy over 90% with 0% noisy labels is not very different from an approximate accuracy of 85% with 90% noisy labels. However, in medicine, an accurate diagnosis is essential for appropriate treatment, and even a 1% decrease in accuracy cannot be tolerated.</p>
      <p>Since open CXR datasets from the National Institutes of Health (NIH) and Stanford CheXpert are preprocessed using natural language processing, they tend to contain a certain extent of wrong and uncertain labels [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. Several groups studied the effect of label noise in the CNN classification model. Rolnick et al [<xref ref-type="bibr" rid="ref3">3</xref>] claimed that CNNs are robust to massive label noise. Beigman and Beigman [<xref ref-type="bibr" rid="ref4">4</xref>], Guan et al [<xref ref-type="bibr" rid="ref7">7</xref>], Lee et al [<xref ref-type="bibr" rid="ref8">8</xref>], Choi et al [<xref ref-type="bibr" rid="ref9">9</xref>], and Sukhbaatar and Fergus [<xref ref-type="bibr" rid="ref10">10</xref>] attempted to develop models from noisy datasets directly. Others such as Brodley and Friedl [<xref ref-type="bibr" rid="ref11">11</xref>] identified and reduced noisy data using majority voting before training. These researchers claim that such approaches can make a model robust to up to 30% label noise. This type of research is subject to the risk of classifying hard labels as noisy labels. To overcome this problem, some researchers attempted to combine noisy data with accurate datasets, as proposed by Zhu [<xref ref-type="bibr" rid="ref12">12</xref>]. When the label noise was provided, Bootkrajang and Kabán [<xref ref-type="bibr" rid="ref13">13</xref>] proposed a generic unbiased estimator for binary classification. Unlike electronic health records, images can be re-reported any time with domain experts’ efforts. There are several studies that analyzed electronic health records using natural language processing techniques [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>].</p>
      <p>Many have attempted to classify CXR with deep learning techniques. Rajpurkar et al [<xref ref-type="bibr" rid="ref5">5</xref>] proposed a CNN-based CXR classifier with an overall area under the curve (AUC) ranging between 0.8 and 0.93. Yao et al [<xref ref-type="bibr" rid="ref16">16</xref>] used a similar method to classify multiclass CXR. Pesce et al [<xref ref-type="bibr" rid="ref17">17</xref>] used over 430,000 CXRs and proposed an architecture with attention structure based on the evidence that deep learning is robust to label noise [<xref ref-type="bibr" rid="ref3">3</xref>].</p>
      <p>The questions raised were “Are noisy and wrong-labeled datasets credible?” and “Can we believe a CAD model that used these open datasets during training?” In this study, we contemplate the credibility of these datasets and the effect of label noise during training. The aim of this study is threefold: (1) to train computed tomography (CT)-confirmed CXR datasets from Asan Medical Center (AMC) and Seoul National University Bundang Hospital (SNUBH), which can be considered clean with an intentionally given label noise of 0%, 1%, 2%, 4%, 8%, 16%, and 32%; (2) to train NIH and CheXpert datasets, which are considered noisy with an intentionally given label noise of 0%, 1%, 2%, 4%, 8%, 16%, and 32%; and (3) to have the NIH and CheXpert datasets re-evaluated by 3 physicians and 1 radiologist.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Image Dataset</title>
        <p>Our CXRs were collected from 2 hospitals, AMC and SNUBH in South Korea. Data from 2011 to 2016 were collected. Every CXR was confirmed with its nearest corresponding CT scan and was reevaluated by a chest radiologist with more than 20 years of experience. CXRs contained 5 clinically relevant disease categories, namely, nodule (ND), consolidation (CS), interstitial opacity (IO), pleural effusion (PLE), and pneumothorax (PT). These categories were classified into 2 classes, normal and abnormal. A detailed description of our dataset is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>Descriptions of the NIH and the CheXpert datasets can be found in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendices 2</xref> and <xref ref-type="supplementary-material" rid="app3">3</xref> [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. To validate the NIH and CheXpert datasets, we randomly sampled the same number of normal and abnormal images from the NIH and CheXpert datasets as that from our dataset, that is, all 3 datasets were sampled to have 7103 no finding images and 8680 abnormal images. In the NIH dataset, images were classified into 15 categories including a “no finding” category. For the NIH dataset, we did not distinguish each disease category, but unified all the disease categories into 1 class, “abnormal”. In the CheXpert dataset, images were classified into 14 categories including “no finding.” In each image class, every image was subclassified as positive/uncertain/negative. We did not use positive/uncertain/negative because the uncertain class can be confusing and negative images were not clinically important. Instead, 14 positive-labeled disease categories were classified as “abnormal,” and the “no finding” category was classified as “normal” in the CheXpert dataset. Because there were disease categories present in the CheXpert dataset, which were not in our dataset or the NIH dataset, we unified every disease class as “abnormal” and considered “no finding” as “normal.” Furthermore, the “abnormal” class was randomly sampled to be the same number as our “abnormal” dataset without considering the number of each disease class. These “no finding” and “abnormal” dataset descriptions are presented in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Brief description of the datasets of Asan Medical Center and Seoul National University Bundang Hospital, National Institutes of Health, and CheXpert.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="250"/>
            <col width="260"/>
            <col width="240"/>
            <col width="250"/>
            <thead>
              <tr valign="top">
                <td>
                 Distribution of images
                </td>
                <td>AMC<sup>a</sup> and SNUBH<sup>b</sup> dataset</td>
                <td>NIH<sup>c</sup> dataset</td>
                <td>CheXpert dataset</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Number of no-finding or normal images</td>
                <td>7103</td>
                <td>60,361</td>
                <td>22,419</td>
              </tr>
              <tr valign="top">
                <td>Number of abnormal images</td>
                <td>8680</td>
                <td>51,759</td>
                <td>201,897</td>
              </tr>
              <tr valign="top">
                <td>Number of total images</td>
                <td>15,783</td>
                <td>112,120</td>
                <td>224,316</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>AMC: Asan Medical Center.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>SNUBH: Seoul National University Bundang Hospital.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>NIH: National Institutes of Health.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>After random shuffling, we analyzed the distribution of 3 randomly shuffled datasets. The distributions of these randomly shuffled datasets are shown in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>.</p>
        <p>The label quality of public data from open datasets was evaluated by 3 licensed nonradiologist physicians and 1 board-certified radiologist. For the 3 nonradiologists, in each of the CheXpert and the NIH dataset, we randomly sampled 100 images. In the NIH dataset, 25 images were “abnormal” and 75 images were “no finding.” In CheXpert, 85 images were “abnormal” and 15 images were “normal.” For the radiologist, we randomly selected 200 images from each public dataset. The board-certified radiologist evaluated each given dataset twice, and we recorded the concordance rate for the 2 evaluations. For each open dataset, these images were passed to 3 physicians and 1 radiologist, who reported whether each image belonged to the “no finding” or “abnormal” category.</p>
      </sec>
      <sec>
        <title>Image Preprocessing</title>
        <p>Every CXR image from the NIH and CheXpert datasets was stored in an 8-bit PNG format. To feed the images in the training model, we changed 3- or 4-channel PNG images to grayscale. The 12-bit DICOM (Digital Imaging and Communications in Medicine) files in our dataset were converted into 8-bit gray PNG format, for which we attempted to set a consistent training condition. In open datasets, sizes of images differed from image to image. To solve this problem, we unified the image size to be 1024×1024 pixels. Similarly, our DICOM images were resized from approximately 2000×2000 pixels to 1024×1024 pixels. Bilinear interpolation was used to resize images, and min-max scaling was applied to each image so that every pixel had a value in the range of 0-1. All the processing was performed using the opencv-python package by Olli-Pekka Heinisuo.</p>
      </sec>
      <sec>
        <title>Training Details</title>
        <p>Each dataset was classified into 3 groups: training, validation, and test sets. The detailed composition of our dataset including the training, validation, and test sets is presented in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>. Among the various CNN models, CheXNet by Rajpurkar et al [<xref ref-type="bibr" rid="ref5">5</xref>] was selected as the baseline model. CheXNet is a 121-layered Densenet [<xref ref-type="bibr" rid="ref19">19</xref>] with 14 disease categories. We changed the last fully connected layer to 1 node to simplify the classification into normal and abnormal. We trained CheXNet from scratch without using the pretrained model. Labels of each training dataset were intentionally misrepresented with rates of 0%, 1%, 2%, 4%, 8%, 16%, and 32%. To generate a training set to have every label noise, we first randomly shuffled all the datasets and changed the label of images in the shuffled list in order from the front. The order was shuffled again to distribute the misrepresented label data evenly in the entire training set. We used Keras python package and Adam optimizer [<xref ref-type="bibr" rid="ref20">20</xref>] with a learning rate of 0.0001. The loss was set to be binary cross-entropy, and we measured the accuracy with a threshold of 0.5. We trained 20 epochs for each label noise level and each dataset. The training was conducted with a NVIDIA GeForce RTX 2070 for approximately 3 days for each dataset. Moreover, we did not apply label noises for the validation and test sets.</p>
      </sec>
      <sec>
        <title>Evaluation Metric and Statistics</title>
        <p>For inference, we selected the model with the smallest validation loss in each dataset. In each test set of datasets, we evaluated receiver operating characteristics (ROC) and AUC. The inference results were compared using a semi-log plot. Subsequently, AUC of 0% was compared with each noise level, using standard error defined by Hanley and McNeil [<xref ref-type="bibr" rid="ref21">21</xref>]. The SE is defined as follows:</p>
        <p>
          <graphic xlink:href="medinform_v8i8e18089_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
        <p>where auc is AUC, n<sub>a</sub> is the number of abnormal images, and n<sub>n</sub> is the number of normal images, <inline-graphic xlink:href="medinform_v8i8e18089_fig8.png" xlink:type="simple" mimetype="image"/>
       and
       <inline-graphic xlink:href="medinform_v8i8e18089_fig9.png" xlink:type="simple" mimetype="image"/></p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Accuracies of Each Label Noise</title>
        <p>After training 3 datasets with the CNN architecture, ROC curves were drawn as depicted in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
        <p><xref rid="figure2" ref-type="fig">Figure 2</xref> illustrates a semilog plot of AUCs of ROC curves from our dataset, the NIH dataset, and the CheXpert dataset for every noise level. Each vertical line represents the standard error for the given AUC.</p>
        <p>In the NIH and the CheXpert datasets, the AUC was poorer than that in our dataset at 0% label noise. The AUC of our dataset was more sensitive to label noise than that of the NIH and the CheXpert datasets. F1 scores are plotted in <xref rid="figure3" ref-type="fig">Figure 3</xref>.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Receiver operating characteristic (ROC) curves for datasets of Asan Medical Center and Seoul National University Bundang Hospital, National Institutes of Health, and CheXpert (from left to right) with each label noise rate (0%, 1%, 2%, 4%, 8%, 16%, and 32%).</p>
          </caption>
          <graphic xlink:href="medinform_v8i8e18089_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Semilog plot of area under the curves (AUC) of receiver operating characteristic (ROC) curves in the datasets of Asan Medical Center and Seoul National University Bundang Hospital, National Institutes of Health, and CheXpert (from left to right) with each label noise rate (0%, 1%, 2%, 4%, 8%, 16%, and 32%).</p>
          </caption>
          <graphic xlink:href="medinform_v8i8e18089_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>F1 scores of the datasets of Asan Medical Center and Seoul National University Bundang Hospital, National Institutes of Health, and CheXpert (from left to right).</p>
          </caption>
          <graphic xlink:href="medinform_v8i8e18089_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>The ROC comparisons for the 3 datasets are presented in <xref ref-type="table" rid="table2">Table 2</xref>. The decrease in AUC became statistically significant when the noise level reached 2% in our dataset. However, in the NIH and CheXpert datasets, there was no statistical significance at noise levels of up to 16% in the training set.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Receiver operating characteristic (ROC) comparison for the datasets of Asan Medical Center and Seoul National University Bundang Hospital, National Institutes of Health, and CheXpert.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="370"/>
            <col width="260"/>
            <col width="340"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Dataset and label noise level (%)</td>
                <td>Difference of AUC<sup>a</sup> with respect to 0%</td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="4">
                  <bold>AMC<sup>b</sup></bold>
                  <bold>and SNUBH<sup>c</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td rowspan="6">
                  <break/>
                </td>
                <td>1</td>
                <td>0.08</td>
                <td>.08</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>0.097</td>
                <td>.04</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>0.107</td>
                <td>.02</td>
              </tr>
              <tr valign="top">
                <td>8</td>
                <td>0.118</td>
                <td>.007</td>
              </tr>
              <tr valign="top">
                <td>16</td>
                <td>0.197</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td>32</td>
                <td>0.176</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>NIH<sup>d</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td rowspan="6">
                  <break/>
                </td>
                <td>1</td>
                <td>–0.012</td>
                <td>.74</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>–0.020</td>
                <td>.58</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>–0.041</td>
                <td>.24</td>
              </tr>
              <tr valign="top">
                <td>8</td>
                <td>0.031</td>
                <td>.37</td>
              </tr>
              <tr valign="top">
                <td>16</td>
                <td>0.014</td>
                <td>.68</td>
              </tr>
              <tr valign="top">
                <td>32</td>
                <td>0.111</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>CheXpert</bold>
                </td>
              </tr>
              <tr valign="top">
                <td rowspan="6">
                  <break/>
                </td>
                <td>1</td>
                <td>–0.005</td>
                <td>.91</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>0.003</td>
                <td>.99</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>0.005</td>
                <td>.90</td>
              </tr>
              <tr valign="top">
                <td>8</td>
                <td>0.048</td>
                <td>.86</td>
              </tr>
              <tr valign="top">
                <td>16</td>
                <td>0.022</td>
                <td>.94</td>
              </tr>
              <tr valign="top">
                <td>32</td>
                <td>0.028</td>
                <td>&#60;.001</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>AUC: area under the curve.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>AMC: Asan Medical Center.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>SNUBH: Seoul National University Bundang Hospital.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>NIH: National Institutes of Health.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>For our dataset, we analyzed subgroups of abnormal cases, as shown in <xref rid="figure4" ref-type="fig">Figure 4</xref>.</p>
        <p>There were 1413 normal CXRs, 449 ND CXRs, 322 CS CXRs, 261 IO CXRs, 548 PLE CXRs, and 298 PT CXRs in our test set. We combined the 1413 normal cases with each disease subclass and performed ROC curve analysis. Among the subgroups (ND, CS, IO, PLE, and PT), no subgroup stood out as being markedly more sensitive to label noise. However, among these classes, IO was the most robust to label noise, showing the smallest decline in AUCs.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Subgroup analysis of abnormal cases in the dataset of Asan Medical Center and Seoul National University Bundang Hospital.</p>
          </caption>
          <graphic xlink:href="medinform_v8i8e18089_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Visual Scoring of Open Dataset</title>
        <p>The NIH and the CheXpert datasets were reevaluated by 3 nonradiologist licensed physicians and 1 radiologist. Each physician evaluated the CXRs once, and the radiologist evaluated the CXRs twice. The 3 physicians rated the accuracy of the NIH dataset as 75% (75/100), 65% (65/100), and 84% (84/100), and that of the CheXpert dataset as 65% (65/100), 77% (77/100), and 61% (61/100), respectively. The radiologist who evaluated the CXRs twice rated the accuracy of the NIH dataset as 67.5% (135/200) and 65% (130/200) for each evaluation and rated the accuracy of the CheXpert dataset as 81% (162/200) and 77% (154/200) for each evaluation. The concordance rates of the 2 evaluations were 92% (184/200) and 56% (112/200) for the NIH and CheXpert datasets, respectively. <xref rid="figure5" ref-type="fig">Figure 5</xref> depicts the sensitivity and specificity of the ratings of the 3 physicians. The first row shows the result of visual scoring by the 3 physicians for the NIH dataset, and the second row shows the result of visual scoring by the 3 physicians for the CheXpert (Stanford) dataset.</p>
        <p><xref rid="figure6" ref-type="fig">Figure 6</xref> shows the accuracy, sensitivity, and specificity of the 2 evaluations of 1 radiologist, along with the concordance rate of the 2 evaluations. One radiologist visually scored the 2 public datasets twice. The first and second columns from the left show the results of visual scoring for the public datasets. The third column shows the concordance rate of the 2 visual scorings for each dataset.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Visual scoring by 3 licensed physicians. Pred: predicted; Abnl: abnormal; NL: normal; NIH: National Institutes of Health; Acc: accuracy.</p>
          </caption>
          <graphic xlink:href="medinform_v8i8e18089_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Visual scoring of thoracic radiologist over a 20-year experience. Pred: predicted; Abnl: abnormal; NL: normal; NIH: National Institutes of Health; Acc: accuracy.</p>
          </caption>
          <graphic xlink:href="medinform_v8i8e18089_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <p>The results of our dataset reveal that the CNN architecture is extremely sensitive to label noise. However, the results of the NIH and CheXpert datasets demonstrate that the open datasets are robust to label noise, suggesting that the NIH and CheXpert datasets essentially contain label noise already; adding label noise does not significantly change their overall noise levels, which explains their apparent robustness. Therefore, training CNN architectures with open datasets has several drawbacks. First, CheXNet cannot be properly trained on the NIH dataset because of the extensive noise level of the NIH dataset. Since the open datasets were labeled with classical natural language processing, abnormal CXRs reported to have “no interval change” can be categorized as “no finding.” This can amplify the label noise of the open datasets.</p>
      <p>Furthermore, the “no finding” category does not imply normal. There were 15 classes in the NIH dataset classified as “no finding” and 14 classes in CheXpert classified as “no finding,” suggesting that other lesions may be categorized as “no finding.” For example, a cavity due to tuberculosis, a reticular pattern due to diffuse interstitial lung disease, or hyperinflation due to chronic obstructive lung disease could be classified as “no finding.” Rajpurkar et al [<xref ref-type="bibr" rid="ref5">5</xref>] reported the CheXNet performance to be similar to that of a radiologist in categorizing pneumonia, rather than a “no finding” category, possibly caused by label noise and/or by the insufficient performance of CheXNet in differentiating “no finding” from “abnormal.” Therefore, labeling with natural language processing is not suitable for CXR CAD model development. The rating accuracies of our 3 physicians on “no finding” versus “abnormal” were approximately 60%-80%, and the accuracy of confirmation by 1 radiologist for the NIH and CheXpert datasets was around 60% and 80%, respectively, which implies that these open datasets have a high occurrence of mislabeled data. The concordance rate of the 1 radiologist was 92% (184/200) for NIH and 56% (112/200) for CheXpert. The low concordance rate for CheXpert may have originated from the blurry texture of the CheXpert images.</p>
      <p>To analyze their performance, we evaluated the models on corrected test sets of the open datasets. First, after the radiologist’s 2-time confirmation, we tested the corrected labels using the weights of the models that had been trained at each label noise level. The result is shown in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>. Owing to the massive label noise of the NIH dataset, CheXNet does not work properly for any of the label noise models. In the CheXpert setting, the situation is slightly better, yet the performance was poor, as expected.</p>
      <p>There could be an array of additional issues that affect the quality of the open datasets. The CheXpert and NIH datasets are 8-bit PNG image files. Therefore, information loss is unavoidable during conversion from 12-bit DICOM files to the PNG image format.</p>
      <p>Robustness of the CheXNet model trained by the NIH and CheXpert datasets does not translate to the robustness of the CNN architecture. The results of our dataset show that CNN is not robust to the noise level. Rather, robustness of the models trained by open datasets can be considered a result of their original impurity. The open datasets are not well-preprocessed, leading them to contain label errors to a certain extent. A low level of label noise does not visibly affect the impurity, and accuracy seems to endure up to 16%.</p>
      <p>Regardless of these drawbacks, CNN is considered the best tool for CAD development. Our study urges CAD developers to maximize their effort in accumulating extremely high-quality datasets.</p>
      <p>Our study has several limitations. First, we considered only 1 network, CheXNet. Other networks such as ChoiceNet can be robust to label noise [<xref ref-type="bibr" rid="ref9">9</xref>]. Second, a well-performing model that is robust to label noise is not indicative of its tolerability towards label noise in open datasets. Using open datasets commercially or for research must be seriously considered. Unlike MNIST, they have considerable impacts on the diagnosis of each patient.</p>
      <p>Furthermore, it is interesting to consider active learning with predicted images that have low confidence levels—that is, predicted labels with a low confidence rate after the final activation function, such as 0.4 to 0.6. We might consider these as mislabeled images. Therefore, using the high-confidence images and their labels, we can relabel the low-confidence images, assisted by a radiologist if needed, and train the CNN again. This can be used as a strategy for accurately training a noisy dataset. However, this strategy is beyond the scope of this study. In our future work, this kind of strategy will be used to train noisy datasets accurately.</p>
      <p>As mentioned earlier, even a 1% decrease in accuracy can have an enormous effect on a large patient group. Additionally, categorizing data into “no finding” and “abnormal” may not be ideal, as this could be a direct consequence of mislabels on “no finding.” There may be other disease patterns that were not labeled, resulting in an unfair comparison of the 3 datasets with the same criteria. Furthermore, there is a statistical limitation to this study. To compare the CNN models exactly, we trained the models with only 20 epochs for each label noise level. For some training steps, 20 epochs did not seem sufficient for accuracy saturation. However, we used the same network with the same hyperparameters for these comparisons. For further study, multiple and repetitive training needs to be performed.</p>
      <p>In conclusion, the robustness of CAD to label noise with open datasets seems to be a result of their impurity caused by natural language processing. CNN is not robust to label noise in large-sized and complicated images. Therefore, it needs to be emphasized that clean labels and accurate datasets are a necessary condition for developing clinically relevant CAD in medicine.</p>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Dataset description of the Asan Medical Center and Seoul National University Bundang Hospital dataset.</p>
        <media xlink:href="medinform_v8i8e18089_app1.docx" xlink:title="DOCX File , 15 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Dataset description of the National Institutes of Health (NIH) dataset.</p>
        <media xlink:href="medinform_v8i8e18089_app2.docx" xlink:title="DOCX File , 15 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Dataset description of CheXpert dataset.</p>
        <media xlink:href="medinform_v8i8e18089_app3.docx" xlink:title="DOCX File , 15 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Distribution of 3 randomly shuffled datasets.</p>
        <media xlink:href="medinform_v8i8e18089_app4.docx" xlink:title="DOCX File , 17 KB"/>
      </supplementary-material>
      <supplementary-material id="app5">
        <label>Multimedia Appendix 5</label>
        <p>Dataset description for training, validation, and test sets of the Asan Medical Center (AMC) and Seoul National University Bundang Hospital (SNUBH) dataset.</p>
        <media xlink:href="medinform_v8i8e18089_app5.docx" xlink:title="DOCX File , 16 KB"/>
      </supplementary-material>
      <supplementary-material id="app6">
        <label>Multimedia Appendix 6</label>
        <p>Receiver operating characteristic (ROC) curves of corrected test datasets. Left is for the NIH dataset, right is for the CheXpert dataset. One radiologist with over a 20-year experience confirmed 200 images from each dataset twice, and models that have been trained with each label noise were used to draw ROC curves.</p>
        <media xlink:href="medinform_v8i8e18089_app6.png" xlink:title="PNG File , 210 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AMC</term>
          <def>
            <p>Asan Medical Center</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AUC</term>
          <def>
            <p>area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CAD</term>
          <def>
            <p>computer-aided diagnosis</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CNN</term>
          <def>
            <p>convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">CS</term>
          <def>
            <p>consolidation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">CT</term>
          <def>
            <p>computed tomography</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">CXR</term>
          <def>
            <p>chest x-ray</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">DICOM</term>
          <def>
            <p>Digital Imaging and Communications in Medicine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">IO</term>
          <def>
            <p>interstitial opacity</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">MNIST</term>
          <def>
            <p>Modified National Institute of Standards and Technology</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">ND</term>
          <def>
            <p>nodule</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">NIH</term>
          <def>
            <p>National Institutes of Health</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">PLE</term>
          <def>
            <p>pleural effusion</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">PT</term>
          <def>
            <p>pneumothorax</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">ROC</term>
          <def>
            <p>receiver operating characteristic</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">SNUBH</term>
          <def>
            <p>Seoul National University Bundang Hospital</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This project was supported by grants of the Korea Health Technology R&#38;D Project through the Korea Health Industry Development Institute (KHIDI), funded by the Ministry of Health and Welfare, Republic of Korea (HI18C0022, HI18C2383).</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>RJ conducted experiments, wrote the manuscript, and conducted visual scoring of public datasets (nonradiologist). MJ and Kyung Hwa Lee conducted visual scoring of public datasets (nonradiologist). HNN conducted visual scoring of public datasets (radiologist). SML and Kyung Hee Lee built chest x-ray datasets from Asan Medical Center and Seoul National University Bundang Hospital, respectively. JBS reified experiment instructions. As the project manager, NK contributed to manuscript editing and reified experiment instructions.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Doi</surname>
              <given-names>Kunio</given-names>
            </name>
          </person-group>
          <article-title>Computer-aided diagnosis in medical imaging: historical review, current status and future potential</article-title>
          <source>Comput Med Imaging Graph</source>
          <year>2007</year>
          <volume>31</volume>
          <issue>4-5</issue>
          <fpage>198</fpage>
          <lpage>211</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/17349778"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.compmedimag.2007.02.002</pub-id>
          <pub-id pub-id-type="medline">17349778</pub-id>
          <pub-id pub-id-type="pii">S0895-6111(07)00026-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC1955762</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Yao</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Computer-aided detection in chest radiography based on artificial intelligence: a survey</article-title>
          <source>BioMed Eng OnLine</source>
          <year>2018</year>
          <month>8</month>
          <day>22</day>
          <volume>17</volume>
          <issue>1</issue>
          <pub-id pub-id-type="doi">10.1186/s12938-018-0544-y</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rolnick</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Veit</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Belongie</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shavit</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Deep learning is robust to massive label noise</article-title>
          <source>arXiv preprint arXiv</source>
          <year>2017</year>
          <fpage>170510694</fpage>
          <lpage>2017</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Beigman Klebanov</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Beigman</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>From Annotator Agreement to Noise Models</article-title>
          <source>Computational Linguistics</source>
          <year>2009</year>
          <month>12</month>
          <volume>35</volume>
          <issue>4</issue>
          <fpage>495</fpage>
          <lpage>503</lpage>
          <pub-id pub-id-type="doi">10.1162/coli.2009.35.4.35402</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rajpurkar</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Irvin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Mehta</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Duan</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bagul</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Langlotz</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Shpanskaya</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Chexnet: Radiologist-level pneumonia detection on chest x-rays with deep learning</article-title>
          <source>arXiv preprint arXiv</source>
          <year>2017</year>
          <fpage>171105225</fpage>
          <lpage>2017</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Bagheri</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Summers</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Chestx-ray8: Hospital-scale chest x-ray database and benchmarks on weakly-supervised classification and localization of common thorax diseases</article-title>
          <year>2017</year>
          <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>
          <conf-date>2017</conf-date>
          <conf-loc>Honolulu</conf-loc>
          <fpage>2097</fpage>
          <lpage>2106</lpage>
          <pub-id pub-id-type="doi">10.1109/cvpr.2017.369</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gulshan</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hinton</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Who said what: Modeling individual labelers improves classification</article-title>
          <year>2018</year>
          <conf-name>Thirty-Second AAAI Conference on Artificial Intelligence</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>New Orleans</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K-h</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Cleannet: Transfer learning for scalable image classifier training with label noise</article-title>
          <year>2018</year>
          <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>Salt Lake City</conf-loc>
          <fpage>5447</fpage>
          <lpage>5456</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hong</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lim</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>ChoiceNet: robust learning by revealing output correlations</article-title>
          <source>arXiv preprint arXiv</source>
          <year>2018</year>
          <fpage>180506431</fpage>
          <lpage>2018</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sukhbaatar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Fergus</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Learning from noisy labels with deep neural networks</article-title>
          <source>arXiv preprint arXiv</source>
          <year>2014</year>
          <volume>2</volume>
          <issue>3</issue>
          <fpage>14062080</fpage>
          <lpage>2014</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brodley</surname>
              <given-names>CE</given-names>
            </name>
            <name name-style="western">
              <surname>Friedl</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>Identifying Mislabeled Training Data</article-title>
          <source>jair</source>
          <year>1999</year>
          <month>08</month>
          <day>01</day>
          <volume>11</volume>
          <fpage>131</fpage>
          <lpage>167</lpage>
          <pub-id pub-id-type="doi">10.1613/jair.606</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>van Engelen</surname>
              <given-names>Je</given-names>
            </name>
            <name name-style="western">
              <surname>Hoos</surname>
              <given-names>Hh</given-names>
            </name>
          </person-group>
          <article-title>A survey on semi-supervised learning</article-title>
          <source>Mach Learn</source>
          <year>2019</year>
          <month>11</month>
          <day>15</day>
          <volume>109</volume>
          <issue>2</issue>
          <fpage>373</fpage>
          <lpage>440</lpage>
          <pub-id pub-id-type="doi">10.1007/s10994-019-05855-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bootkrajang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kabán</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Label-noise robust logistic regression and its applications</article-title>
          <year>2012</year>
          <conf-name>Joint European conference on machine learning and knowledge discovery in databases: Springer</conf-name>
          <conf-date>2012</conf-date>
          <conf-loc>Bristol</conf-loc>
          <fpage>143</fpage>
          <lpage>158</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-642-33460-3_15</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Vimalananda</surname>
              <given-names>VG</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Automatic Detection of Hypoglycemic Events From the Electronic Health Record Notes of Diabetes Patients: Empirical Study</article-title>
          <source>JMIR Med Inform</source>
          <year>2019</year>
          <month>11</month>
          <day>8</day>
          <volume>7</volume>
          <issue>4</issue>
          <fpage>e14340</fpage>
          <pub-id pub-id-type="doi">10.2196/14340</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Cunningham</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>McManus</surname>
              <given-names>DD</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Detection of Bleeding Events in Electronic Health Record Notes Using Convolutional Neural Network Models Enhanced With Recurrent Neural Network Autoencoders: Deep Learning Approach</article-title>
          <source>JMIR Med Inform</source>
          <year>2019</year>
          <month>02</month>
          <day>08</day>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>e10788</fpage>
          <pub-id pub-id-type="doi">10.2196/10788</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yao</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Poblenz</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Dagunts</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Covington</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Bernard</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Lyman</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Learning to diagnose from scratch by exploiting dependencies among labels</article-title>
          <source>arXiv preprint arXiv</source>
          <year>2018</year>
          <conf-name>Sixth International Conference on Learning Representations</conf-name>
          <conf-date>April 30 - May 3, 2018</conf-date>
          <conf-loc>Vancouver</conf-loc>
          <fpage>171010501</fpage>
          <lpage>2017</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pesce</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Joseph Withey</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ypsilantis</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bakewell</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Goh</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Montana</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Learning to detect chest radiographs containing pulmonary lesions using visual attention networks</article-title>
          <source>Medical Image Analysis</source>
          <year>2019</year>
          <month>04</month>
          <volume>53</volume>
          <fpage>26</fpage>
          <lpage>38</lpage>
          <pub-id pub-id-type="doi">10.1016/j.media.2018.12.007</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Irvin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rajpurkar</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ko</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ciurea-Ilcus</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chute</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Marklund</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Haghgoo</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ball</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Shpanskaya</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Seekins</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mong</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Halabi</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Sandberg</surname>
              <given-names>JK</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Larson</surname>
              <given-names>DB</given-names>
            </name>
            <name name-style="western">
              <surname>Langlotz</surname>
              <given-names>CP</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>BN</given-names>
            </name>
            <name name-style="western">
              <surname>Lungren</surname>
              <given-names>MP</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>AY</given-names>
            </name>
          </person-group>
          <article-title>CheXpert: A Large Chest Radiograph Dataset with Uncertainty Labels and Expert Comparison</article-title>
          <source>AAAI</source>
          <year>2019</year>
          <month>07</month>
          <day>17</day>
          <volume>33</volume>
          <fpage>590</fpage>
          <lpage>597</lpage>
          <pub-id pub-id-type="doi">10.1609/aaai.v33i01.3301590</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>van der Maaten</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Weinberger</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Densely connected convolutional networks</article-title>
          <year>2017</year>
          <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>
          <conf-date>2017</conf-date>
          <conf-loc>Hawaii</conf-loc>
          <fpage>4700</fpage>
          <lpage>4708</lpage>
          <pub-id pub-id-type="doi">10.1109/CVPR.2017.243</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kingma</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ba</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Adam: A method for stochastic optimization</article-title>
          <source>arXiv preprint arXiv</source>
          <year>2015</year>
          <conf-name>3rd International Conference on Learning Representations</conf-name>
          <conf-date>May 7 - 9, 2015</conf-date>
          <conf-loc>San Diego</conf-loc>
          <fpage>A</fpage>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hanley</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>McNeil</surname>
              <given-names>BJ</given-names>
            </name>
          </person-group>
          <article-title>The meaning and use of the area under a receiver operating characteristic (ROC) curve</article-title>
          <source>Radiology</source>
          <year>1982</year>
          <month>04</month>
          <volume>143</volume>
          <issue>1</issue>
          <fpage>29</fpage>
          <lpage>36</lpage>
          <pub-id pub-id-type="doi">10.1148/radiology.143.1.7063747</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
