This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.

Deep neural networks are showing impressive results in different medical image classification tasks. However, for real-world applications, there is a need to estimate the network’s uncertainty together with its prediction.

In this review, we investigate in what form uncertainty estimation has been applied to the task of medical image classification. We also investigate which metrics are used to describe the effectiveness of the applied uncertainty estimation.

Google Scholar, PubMed, IEEE Xplore, and ScienceDirect were screened for peer-reviewed studies, published between 2016 and 2021, that deal with uncertainty estimation in medical image classification. The search terms “uncertainty,” “uncertainty estimation,” “network calibration,” and “out-of-distribution detection” were used in combination with the terms “medical images,” “medical image analysis,” and “medical image classification.”

A total of 22 papers were chosen for detailed analysis through the systematic review process. This paper provides a table for a systematic comparison of the included works with respect to the applied method for estimating the uncertainty.

The applied methods for estimating uncertainties are diverse, but the sampling-based methods Monte Carlo dropout and Deep Ensembles are used most frequently. We conclude that future work could investigate the benefits of uncertainty estimation in collaborative settings of artificial intelligence systems and human experts.

RR2-10.2196/11936

Digital image analysis is a helpful tool to support physicians in their clinical decision-making. Originally, digital image analysis was performed by extracting handcrafted features from an input image. These features are tailored to the underlying data, meaning that for a specific disease, only specific features are searched for in the observed image. With the advent of deep learning, however, a “black box” has been established that can, in the setting of supervised learning, intrinsically learn such features from labeled data. In recent years, deep learning–based methods have vastly outperformed traditional methods that rely on handcrafted features. With the learning-based methods, the focus has shifted from manually defining image features to providing clean and correctly annotated data to the learning system. This data-centric approach, however, brings new challenges.

In a clinical setting, when such algorithms are meant to be used as diagnostic assistance tools, the user has to be able to understand how the artificial intelligence (AI) system came up with the diagnosis. One key component in this regard is a measure of confidence of the AI system in its prediction. Such a measure is important to increase trust in the AI system, and it may improve clinical decision-making [

In the results section, we categorize the reviewed works by the uncertainty estimation method they apply. We provide a table that serves as an overview of all the included studies. In the last section, we discuss the most frequently used metrics for evaluating the benefit of uncertainty estimation and give an outlook of possible future research directions with a focus on human-machine collaboration.

In a classification task, the neural network is supposed to predict how likely it is for a given input

Written as a formula, the predictive distribution is as follows:

The predictive distribution given input

Depending on the modeled uncertainty, the predictive uncertainty can be divided into aleatoric uncertainty and epistemic uncertainty. The aleatoric uncertainty describes the uncertainty inherent in the data, whereas the epistemic uncertainty captures the uncertainty of the model. The softmax output of a typical classification network is only able to capture aleatoric uncertainty [
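One common way to make this split concrete, used widely in the Bayesian deep learning literature (a sketch in generic notation, not the exact formulation of any single reviewed work), writes the predictive distribution as a marginal over the network weights w and decomposes its entropy:

```latex
p(y \mid x, \mathcal{D}) = \int p(y \mid x, w)\, p(w \mid \mathcal{D})\, \mathrm{d}w
```

```latex
\underbrace{\mathbb{H}\!\left[p(y \mid x, \mathcal{D})\right]}_{\text{total predictive uncertainty}}
\;=\;
\underbrace{\mathbb{E}_{p(w \mid \mathcal{D})}\!\left[\mathbb{H}\!\left[p(y \mid x, w)\right]\right]}_{\text{aleatoric}}
\;+\;
\underbrace{\mathbb{I}\!\left[y;\, w \mid x, \mathcal{D}\right]}_{\text{epistemic}}
```

Sampling-based methods approximate the intractable integral with Monte Carlo samples of the weights w.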

Ovadia et al [

Sampling-based methods are easy to implement as they make use of existing network architectures. The 2 most popular methods are Monte Carlo dropout (MCDO) [
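As a minimal illustration of the MCDO idea (a NumPy sketch; the toy 2-layer network, its sizes, and the dropout rate of 0.5 are illustrative assumptions, not taken from any reviewed study), dropout is kept active at test time and each stochastic forward pass is treated as one sample from the predictive distribution:

```python
import numpy as np

rng = np.random.default_rng(0)

# Toy 2-layer classifier with dropout on the hidden layer (4 inputs, 3 classes).
W1 = rng.normal(size=(4, 16)); b1 = np.zeros(16)
W2 = rng.normal(size=(16, 3)); b2 = np.zeros(3)

def softmax(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def forward(x, p_drop=0.5):
    h = np.maximum(x @ W1 + b1, 0.0)
    # MCDO: dropout stays ON at inference time.
    mask = rng.random(h.shape) > p_drop
    h = h * mask / (1.0 - p_drop)
    return softmax(h @ W2 + b2)

def mc_dropout_predict(x, T=100):
    samples = np.stack([forward(x) for _ in range(T)])  # (T, n_classes)
    mean = samples.mean(axis=0)
    predictive_entropy = -(mean * np.log(mean + 1e-12)).sum()
    predictive_variance = samples.var(axis=0)
    return mean, predictive_entropy, predictive_variance

x = rng.normal(size=4)
mean, entropy, var = mc_dropout_predict(x)
```

The predictive mean is the class probability estimate; predictive entropy and per-class variance are the two uncertainty measures most often reported by the reviewed works.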

The field of directly modifying the network architecture for improved uncertainty estimation is quite diverse. In the derivation of MCDO, the authors compare their approach to Gaussian processes (GPs). A GP is a method to estimate a distribution over functions [

Approaches that have been included in the comparison by Ovadia et al [

Comparable to sampling multiple models, one can also compute a distribution of predictions by running the network on different augmentations of the input data. Ayhan and Berens [
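The TTA idea can be sketched as follows (the `model` and `augment` functions below are synthetic stand-ins; on real images, the augmentations would be flips, rotations, or crops, and the model a trained classifier):

```python
import numpy as np

rng = np.random.default_rng(1)

def model(x):
    """Stand-in for a trained classifier: deterministic logits from the input."""
    logits = np.array([x.mean(), x.std(), x.max()])
    e = np.exp(logits - logits.max())
    return e / e.sum()

def augment(x):
    """One random augmentation (noise and a shift stand in for image flips/rotations)."""
    x = x + rng.normal(scale=0.05, size=x.shape)
    return np.roll(x, rng.integers(0, len(x)))

def tta_predict(x, n_aug=32):
    # A distribution of predictions from augmented copies of the same input.
    preds = np.stack([model(augment(x)) for _ in range(n_aug)])
    return preds.mean(axis=0), preds.var(axis=0)

x = rng.normal(size=8)
mean, var = tta_predict(x)
```

As with sampling over models, the spread of the predictions serves as the uncertainty estimate; the model itself stays fixed.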

For the systematic review, we searched through Google Scholar, PubMed, IEEE Xplore, and ScienceDirect to identify relevant works that apply uncertainty estimation methods to medical image classification. We limited our search to works that have appeared between January 2016 and October 2021. As search terms, we used “uncertainty,” “uncertainty estimation,” “network calibration,” and “out-of-distribution detection,” and we combined them with the terms “medical images,” “medical image analysis,” and “medical image classification.”

The selection process was conducted according to the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) guidelines [

PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) flow diagram.

Number of publications that apply the respective uncertainty estimation method. EDL: evidential deep learning; GP: Gaussian process; MCDO: Monte Carlo dropout; SVI: stochastic variational inference; TS: temperature scaling; TTA: test-time data augmentation.

Most of the included works evaluate the applied methods by computing an uncertainty measure (mostly predictive variance or predictive entropy). This uncertainty measure is often used to generate retained data versus accuracy evaluations.

Retained data versus accuracy plot from Filos et al [
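Such a curve can be computed by sorting the test samples by their uncertainty and evaluating accuracy on the most certain fraction; a sketch with synthetic correctness labels and uncertainty scores (all values are made up for illustration):

```python
import numpy as np

def retained_accuracy_curve(correct, uncertainty, fractions):
    """Accuracy on the `fraction` most certain samples, for each fraction."""
    order = np.argsort(uncertainty)            # most certain first
    correct_sorted = np.asarray(correct, dtype=float)[order]
    n = len(correct_sorted)
    accs = []
    for f in fractions:
        k = max(1, int(round(f * n)))
        accs.append(correct_sorted[:k].mean())
    return np.array(accs)

# Synthetic example: errors tend to come with higher uncertainty.
rng = np.random.default_rng(2)
correct = rng.random(1000) > 0.2                    # ~80% accurate overall
uncertainty = rng.random(1000) + 0.5 * (~correct)   # errors get extra uncertainty
fractions = [0.5, 0.8, 1.0]
curve = retained_accuracy_curve(correct, uncertainty, fractions)
```

If the uncertainty measure is informative, accuracy decreases as more (less certain) data are retained, with the rightmost point equal to the overall accuracy.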

Some included works focus on network calibration and try to decrease the expected calibration error (ECE) within their experiments. Some other works use the computed uncertainty measure to detect out-of-distribution (OOD) samples.
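OOD detection with an uncertainty measure is typically evaluated with AUROC, treating the uncertainty score (eg, predictive entropy) as the detector; a sketch with synthetic entropy values (the two distributions are made up):

```python
import numpy as np

def auroc(ood_scores, id_scores):
    """Rank-based AUROC: probability that an OOD sample scores higher than an
    in-distribution sample, counting ties as one half."""
    ood = np.asarray(ood_scores)[:, None]
    iid = np.asarray(id_scores)[None, :]
    return (ood > iid).mean() + 0.5 * (ood == iid).mean()

rng = np.random.default_rng(5)
id_entropy = rng.normal(0.3, 0.1, size=500).clip(0)   # in-distribution: low entropy
ood_entropy = rng.normal(0.9, 0.2, size=500).clip(0)  # OOD: high entropy
score = auroc(ood_entropy, id_entropy)
```

An AUROC near 1 means the uncertainty measure separates OOD from in-distribution samples well; 0.5 means it is uninformative.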

Overview of the selected studies.

Methods | Organs or sickness | Sensor | Network architecture | Reported metrics | Data access | Code available | Reference |

MCDO^{a}, GP^{b} | Diabetic retinopathy from fundus images | Camera | Custom CNNs^{c} | Retained data or accuracy, uncertainty or density | Public (Kaggle competition) | Yes | Leibig et al [ |

MCDO, SVI^{d} | Retina | Optical coherence tomography | ResNet-18 | Predictive variance | Public | Yes | Laves et al [ |

MCDO | Skin cancer | Camera | VGG-16, ResNet-50, DenseNet-169 | Uncertainty or density, retained data or accuracy, uncertainty, confusion matrix | Public | Yes | Mobiny et al [ |

MCDO | Brain | MRI^{e} | Modified VGGNet | Reliability diagrams, AUROC^{f} | Private | Yes | Herzog et al [ |

MCDO | Breast cancer | Mammography | VGG-19 | Uncertainty, confusion matrix | Public | No | Calderón-Ramírez et al [ |

MCDO, DUQ^{g} | COVID-19 | X-ray | WideResNet | Jensen-Shannon divergence | Public | No | Calderón-Ramírez et al [ |

MCDO, Ensembles, MFVI^{h} | Diabetic retinopathy from fundus images | Camera | VGG variants | Retained data or accuracy, retained data or AUROC, ROC^{i} | Public (Kaggle competition) | Yes | Filos et al [ |

MCDO, Ensembles, M-heads | Histopathological slides | Microscope | DenseNet | Retained data or AUROC | Public | No | Linmans et al [ |

MCDO, Ensembles, Mix-up | Histopathological slides | Microscope | ResNet-50 | ECE^{j}, AUROC, AUPRC^{k} | Private | No | Thagaard et al [ |

MCDO, Ensembles | COVID-19, histopathological slides (breast cancer) | CT^{l}, microscope | ResNet-152-V2, Inception-V3, Inception-ResNet-V2 | Predictive entropy, retained data or accuracy | Public | No | Yang and Fevens [ |

MCDO, Ensembles, TWD^{m} | Skin cancer | Camera | ResNet-152, Inception-ResNet-V2, DenseNet-201, MobileNet-V2 | Entropy, AUROC | Public (Kaggle competition, ISIC data set) | No | Abdar et al [ |

MCDO, Ensembles, others | Lung | X-ray | WideResNet | AUROC, AUPRC | Public | No | Berger et al [ |

GP | Diabetic retinopathy from fundus images | Camera | Inception-V3 | AUROC | Public (Kaggle competition) | Yes | Toledo-Cortés et al [ |

EDL^{n} + Ensembles | Chest | X-ray | DenseNet-121 | AUROC | Public | No | Ghesu et al [ |

EDL + MCDO | Breast cancer | Mammography | VGGNet | AUROC | Public + private | No | Tardy et al [ |

EDL | Chest, abdomen, and brain | X-ray, ultrasound, MRI | DenseNet-121 | AUROC, coverage or F1 score, coverage or AUROC | Public | No | Ghesu et al [ |

TS^{o}, MCDO | Polyp | Colonoscopy (camera) | ResNet-101, DenseNet-121 | ECE, predictive entropy, predictive variance | Public + private | No | Carneiro et al [ |

TS, DCA^{p} | Head CT, mammography, chest x-ray, histology | Multimodal | AlexNet, | ECE | Public | No | Liang et al [ |

TTA^{q} | Diabetic retinopathy from fundus images | Camera | ResNet-50 | Uncertainty or density, retained data or AUROC | Public (Kaggle competition) | Yes | Ayhan and Berens [ |

TTA, MCBN^{r} | Skin cancer | Camera | ResNet-50 | ECE | Private (31,000 annotated images) | No | Jensen et al [ |

TTA + MCDO | Skin cancer | Camera | EfficientNet-B0 | Predictive entropy, predictive variance, Bhattacharyya coefficient, retained data or accuracy | Public (ISIC data set) | No | Combalia et al [ |

TTA, TS, Ensembles | Diabetic retinopathy from fundus images | Camera | Modified ResNet | Reliability diagrams, AECE^{s}, retained data or AUROC | Public (Kaggle competition) | Yes | Ayhan et al [ |

^{a}MCDO: Monte Carlo dropout.

^{b}GP: Gaussian process.

^{c}CNN: convolutional neural network.

^{d}SVI: stochastic variational inference.

^{e}MRI: magnetic resonance imaging.

^{f}AUROC: area under the receiver operating characteristic curve.

^{g}DUQ: deterministic uncertainty quantification.

^{h}MFVI: mean field variational inference.

^{i}ROC: receiver operating characteristic curve.

^{j}ECE: expected calibration error.

^{k}AUPRC: area under the precision-recall curve.

^{l}CT: computed tomography.

^{m}TWD: three-way decision theory.

^{n}EDL: evidential deep learning.

^{o}TS: temperature scaling.

^{p}DCA: difference between confidence and accuracy.

^{q}TTA: test-time data augmentation.

^{r}MCBN: Monte Carlo batch normalization.

^{s}AECE: adaptive expected calibration error.

The first work that we have included is the study by Leibig et al [

Laves et al [

Mobiny et al [

Another work by Herzog et al [

In two other published works, Calderón-Ramírez et al [

Another set of studies compared MCDO to Deep Ensembles (hereafter simply denoted as Ensembles) and partly to other methods. Filos et al [

Linmans et al [

Thagaard et al [

In another work, Yang and Fevens [

Abdar et al [

In another work, Berger et al [

After having covered several works that focus on sampling-based uncertainty estimation methods, we now turn to works that operate directly on the network’s classification output to estimate uncertainties. One example is the work by Toledo-Cortés et al [

A set of other works applies EDL to estimate uncertainties. In their first work, Ghesu et al [

Comparably, Tardy et al [

Two works that we have included apply TS to medical image classification tasks. Carneiro et al [
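TS divides the logits by a single scalar temperature T fitted on a held-out validation set; a minimal sketch (the toy logits and labels are synthetic, and a grid search stands in for the LBFGS fit that is typically used):

```python
import numpy as np

def softmax(z, T=1.0):
    z = z / T
    e = np.exp(z - z.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

def nll(logits, labels, T):
    """Negative log-likelihood of the labels under temperature-scaled softmax."""
    probs = softmax(logits, T)
    return -np.log(probs[np.arange(len(labels)), labels] + 1e-12).mean()

def fit_temperature(logits, labels, grid=np.linspace(0.5, 5.0, 91)):
    """Pick the T minimizing validation NLL (grid search stand-in for LBFGS)."""
    return grid[np.argmin([nll(logits, labels, T) for T in grid])]

# Toy overconfident model: large margin on the predicted class,
# but some labels are corrupted, so the confidence is not warranted.
rng = np.random.default_rng(3)
labels = rng.integers(0, 3, size=500)
logits = rng.normal(size=(500, 3))
logits[np.arange(500), labels] += 4.0
flip = rng.random(500) < 0.2
labels[flip] = rng.integers(0, 3, size=flip.sum())

T_opt = fit_temperature(logits, labels)
```

For an overconfident network, the fitted temperature exceeds 1, softening the output probabilities; note that TS changes only confidence values, never the predicted class.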

Liang et al [

The concept of TTA was introduced by Ayhan and Berens [

Another work by Jensen et al [

Combalia et al [

In a follow-up of their original work, Ayhan et al [

Through the reviewed publications, we gained an overview of which methods for uncertainty estimation are most frequently used in the field of medical image classification. We found that the sampling-based methods MCDO and Deep Ensembles are the most frequently applied methods. With the sampling-based approaches, it is possible to compute a distribution of predictions and from there determine an uncertainty measure, usually either in the form of predictive entropy or predictive variance. These measures help to identify samples where the neural network is uncertain about its predictions.

In addition to the sampling-based uncertainty evaluations, we also observed evaluations that analyze the calibration of the neural network. Calibration evaluations in terms of reliability diagrams and ECE determine whether the neural network’s output probabilities represent the actual likelihood of a prediction being correct. In the original paper on neural network calibration [
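ECE bins predictions by confidence and takes the sample-weighted average of the |accuracy − confidence| gap per bin; a sketch with 15 equal-width bins (a common choice) and synthetic, well-calibrated predictions:

```python
import numpy as np

def expected_calibration_error(confidences, correct, n_bins=15):
    """Weighted mean of |accuracy - confidence| over equal-width confidence bins."""
    confidences = np.asarray(confidences)
    correct = np.asarray(correct, dtype=float)
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(bins[:-1], bins[1:]):
        in_bin = (confidences > lo) & (confidences <= hi)
        if in_bin.any():
            gap = abs(correct[in_bin].mean() - confidences[in_bin].mean())
            ece += in_bin.mean() * gap
    return ece

# Perfectly calibrated synthetic predictions: a prediction with confidence c
# is correct with probability c, so ECE should be close to 0.
rng = np.random.default_rng(4)
conf = rng.uniform(0.5, 1.0, size=20000)
correct = rng.random(20000) < conf
ece = expected_calibration_error(conf, correct)
```

A miscalibrated (eg, overconfident) model yields a large positive ECE; reliability diagrams plot the same per-bin accuracies against their confidences.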

Another observation we made is that combining uncertainty estimation methods can improve the results. This holds for combinations of Ensembles and MCDO [

By presenting retained data versus accuracy curves, several works [

artificial intelligence

area under the precision-recall curve

area under the receiver operating characteristic curve

convolutional neural network

difference between confidence and accuracy

expected calibration error

evidential deep learning

Gaussian process

Monte Carlo dropout

mean field variational inference

magnetic resonance imaging

out-of-distribution

Preferred Reporting Items for Systematic Reviews and Meta-Analyses

stochastic variational inference

temperature scaling

test-time data augmentation

The research is funded by the Ministerium für Soziales und Integration Baden-Württemberg, Germany.

AK, AH, and TJB are responsible for concept and design. AK and KH did the study selection. HM, EKH, JNK, SF, and CvK critically revised the manuscript and provided valuable feedback.

TJB is the owner of Smart Health Heidelberg GmbH (Handschuhsheimer Landstr. 9/1, 69120 Heidelberg, Germany; https://smarthealth.de), which develops telemedicine mobile apps (such as AppDoc, https://online-hautarzt.net, and Intimarzt, https://intimarzt.de), outside of the submitted work.