<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e69145</article-id><article-id pub-id-type="doi">10.2196/69145</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Optimizing Loop Diuretic Treatment for Mortality Reduction in Patients With Acute Dyspnea Using a Practical Offline Reinforcement Learning Pipeline for Health Care: Retrospective Single-Center Simulation Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Lee</surname><given-names>Jung Min</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tang</surname><given-names>Shengpu</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sjoding</surname><given-names>Michael</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Wiens</surname><given-names>Jenna</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Division of Computer Science and Engineering, College of Engineering, University of Michigan</institution><addr-line>2260 Hayward St</addr-line><addr-line>Ann Arbor</addr-line><addr-line>MI</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Computer Science, Emory College of Arts and Sciences, Emory University</institution><addr-line>Atlanta</addr-line><addr-line>GA</addr-line><country>United States</country></aff><aff id="aff3"><institution>Division of Pulmonary and Critical Care Medicine, Michigan Medicine, University of Michigan</institution><addr-line>Ann Arbor</addr-line><addr-line>MI</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Li</surname><given-names>Dong</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Yoo</surname><given-names>Dongjoon</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Meng</surname><given-names>Fei</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Jenna Wiens, PhD, Division of Computer Science and Engineering, College of Engineering, University of Michigan, 2260 Hayward St, Ann Arbor, MI, 48109, United States, 1 7346474832; <email>wiensj@umich.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>10</day><month>10</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e69145</elocation-id><history><date date-type="received"><day>24</day><month>11</month><year>2024</year></date><date date-type="rev-recd"><day>07</day><month>08</month><year>2025</year></date><date date-type="accepted"><day>15</day><month>08</month><year>2025</year></date></history><copyright-statement>&#x00A9; Jung Min Lee, Shengpu Tang, Michael Sjoding, Jenna Wiens. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 10.10.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e69145"/><abstract><sec><title>Background</title><p>Offline reinforcement learning (RL) has been increasingly applied to clinical decision-making problems. However, due to the lack of a standardized pipeline, prior work often relied on strategies that may lead to overfitted policies and inaccurate evaluations.</p></sec><sec><title>Objective</title><p>In this work, we present a practical pipeline&#x2014;Pipeline for Learning Robust Policies in Reinforcement Learning (PROP-RL)&#x2014;designed to improve robustness and minimize disruption to clinical workflow. We demonstrate its efficacy in the context of learning treatment policies for administering loop diuretics in hospitalized patients.</p></sec><sec sec-type="methods"><title>Methods</title><p>Our cohort included adult inpatients admitted to the emergency department at Michigan Medicine between 2015 and 2019 who required supplemental oxygen. We modeled the management of loop diuretics as an offline RL problem using a discrete state space based on features extracted from electronic health records, a binary action space corresponding to the daily use of loop diuretics, and a reward function based on in-hospital mortality. The policy was trained on data from 2015 to 2018 and evaluated on a held-out set of hospitalizations from 2019, in terms of estimated reduction in mortality compared to clinician behavior.</p></sec><sec sec-type="results"><title>Results</title><p>The final study cohort included 36,570 hospitalizations. The learned treatment policy was based on 60 states: the policy deferred to clinicians in 36 states, recommended the majority action in 22 states, and diverged significantly from clinician behavior in 2 of the states. Among the cases where the policy meaningfully diverged from the behavior policy, the learned policy was estimated to significantly reduce the mortality rate from 3.8% to 2.2% by 1.6% (95% CI 0.4&#x2010;2.7; <italic>P</italic>=.006).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>We applied our pipeline to the clinical problem of loop diuretic treatment, highlighting the importance of robust state representation and thoughtful policy selection and evaluation. Our work reveals areas of potential improvement in current clinical care for loop diuretics and serves as a blueprint for using offline RL for sequential treatment selection in clinical settings.</p></sec></abstract><kwd-group><kwd>reinforcement learning</kwd><kwd>artificial intelligence</kwd><kwd>loop diuretic</kwd><kwd>treatment recommendation</kwd><kwd>treatment selection</kwd><kwd>clinical decision support</kwd><kwd>dynamic treatment regime</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Reinforcement learning (RL) is a branch of artificial intelligence that, through interactions with an environment, learns the optimal sequence of actions that will maximize a desired outcome [<xref ref-type="bibr" rid="ref1">1</xref>]. RL methods are especially well suited to tackle problems that require sequential decision-making where the rewards are delayed. This makes it an attractive solution for learning dynamic treatment policies in health care problems (eg, sepsis [<xref ref-type="bibr" rid="ref2">2</xref>], diabetes [<xref ref-type="bibr" rid="ref3">3</xref>], and hypotension [<xref ref-type="bibr" rid="ref4">4</xref>]) where decisions are made sequentially over a prolonged period of time and the outcome (eg, in-hospital mortality) is observed at a later time point. Due to safety and ethical concerns, training and evaluation of RL policies in this domain often rely on a fixed set of historical data and require the use of offline RL algorithms [<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>However, effectively applying offline RL poses several challenges. First, deriving a robust and informative state representation from high-dimensional health features can be challenging, especially with limited data. Second, the performance of offline RL algorithms is sensitive to hyperparameters [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>], often leading to policies that perform well during development but fail once deployed. Yet a standardized approach for hyperparameter selection has not been established for offline RL. Third, the learned policy may differ substantially from current clinician behavior, resulting in low confidence in evaluation results and potential disruption to clinical workflows [<xref ref-type="bibr" rid="ref9">9</xref>]. While some of these issues have been solved in isolation [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>], there is a notable absence of a standard pipeline for applying offline RL, comparable to the training-validation framework in supervised learning, that integrates these individual solutions. We address this gap by presenting a pipeline (Pipeline for Learning Robust Policies in Reinforcement Learning; PROP-RL) along with a codebase for applying offline RL to health care settings, and demonstrate its efficacy by applying it to the problem of learning treatment policies for loop diuretics.</p><p>Loop diuretics are one of the most commonly prescribed medications in hospitals and are used to control volume and edema in the body by increasing urinary sodium and water excretion [<xref ref-type="bibr" rid="ref13">13</xref>]. They are used to treat patients with acute shortness of breath from fluid accumulation in their lungs, typically associated with conditions such as congestive heart failure or acute pulmonary edema [<xref ref-type="bibr" rid="ref14">14</xref>]. There remains substantial uncertainty and variability regarding when to start and stop loop diuretics [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. This uncertainty leads to inadequate use of loop diuretics, which has been associated with worse clinical outcomes, including higher rates of acute kidney injury and electrolyte disturbances [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>In this paper, we apply offline RL to learn a loop diuretics treatment policy&#x2014;designed to aid health care professionals&#x2014;from electronic health records (EHRs) of hospitalized patients at a large academic hospital. In doing so, we establish a pipeline&#x2014;PROP-RL&#x2014;for applying offline RL in health care settings that incorporates state representation learning, hyperparameter selection, and modification of the learned policy to minimize disruption to existing workflows. We demonstrate the effectiveness of PROP-RL through off-policy evaluation (OPE) and ablation studies [<xref ref-type="bibr" rid="ref19">19</xref>].</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Cohort</title><p>We included adult patients (&#x2265;18 years) admitted to the hospital through the emergency department at Michigan Medicine during the years 2015&#x2010;2019, who required any amount of supplemental oxygen support during the first 24 hours of admission. Patients who underwent surgery within 24 hours of admission were excluded as the supplemental oxygen support provided may not be due to a primary respiratory condition (Section A1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The cohort was split into a development set and a held-out test set consisting of data from patients admitted in 2015&#x2010;2018 and 2019, respectively.</p></sec><sec id="s2-2"><title>Data Preprocessing</title><p>To formulate the management of loop diuretics as an RL problem with discrete time steps, we split the hospitalization data into chronological windows. With the exception of the first and second windows, all windows were 24 hours long, starting and ending at 6 AM (<xref ref-type="fig" rid="figure1">Figure 1A</xref>; Section A2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). In each window, medication records were analyzed to determine whether an oral or intravenous loop diuretic was administered. We assumed all treatment decisions made within a window were based on the patient&#x2019;s state in the previous window. A 6 AM cutoff time was chosen as most clinical rounds (where decisions are made) occur immediately after this point. Analysis was constrained to the first 8 days of hospitalization.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>(A) Diagram of the windowing rule for hospitalizations. (B) Overview of pipeline. (1) Data partition: the development data <italic>D</italic><sub><italic>dev</italic></sub> are partitioned in multiple ways to create the data partitions <italic>D</italic><sub><italic>i</italic></sub>, <italic>i</italic> &#x2208; {1...10}. (2) Defining the state space: a set of candidate discrete state definitions, characterized by the data partition <italic>D</italic><sub><italic>j</italic></sub> used to derive the state definition and the number of states <italic>k</italic>, is generated by learning a lower-dimensional representation of the features and clustering them. (3) Estimating the behavior policy: the behavior policy <italic>&#x03C0;<sub>b</sub></italic> is estimated from the development state using each state definition <italic>S<sub>j,k</sub></italic>. While <italic>&#x03C0;<sub>b</sub></italic> is dependent on <italic>S<sub>j,k</sub></italic>, for simplicity, we refer to the behavior policy as <italic>&#x03C0;<sub>b</sub></italic> in general. (4) Training and selecting the RL policy: a policy <italic>&#x03C0;<sub>i, j, k, l</sub></italic> is trained for each possible hyperparameter combination across all data partitions. The best hyperparameter is used to train the final policy <italic>&#x03C0;<sup>*</sup></italic> on the entire development set. (5) Final policy evaluation: <italic>&#x03C0;<sup>*</sup></italic> is evaluated on the test set <italic>D<sub>test</sub></italic>. EHR: electronic health record; RL: reinforcement learning.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e69145_fig01.png"/></fig><p>For each window, EHR features including age, vital sign measurements, laboratory test results, medications, fluid input and output, and Sequential Organ Failure Assessment (SOFA) scores were extracted (Section A3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). These features capture the patient&#x2019;s most recent health state as well as past treatments, which are necessary for determining future treatments. We used the Flexible Data-Driven Pipeline (FIDDLE) software to convert these into 243-dimensional feature vectors (Section A2, A3, and A16 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) [<xref ref-type="bibr" rid="ref20">20</xref>].</p></sec><sec id="s2-3"><title>Model Development and Evaluation</title><sec id="s2-3-1"><title>Overview</title><p>We modeled the patient environment as a Markov decision process (MDP) defined by (<italic>S</italic>, <italic>A</italic>, <italic>P</italic>, <italic>R</italic>, <italic>&#x03B3;</italic>). <italic>S</italic> and <italic>A</italic> represent the state and action spaces. Given a hospitalization, <italic>s<sub>t</sub></italic>&#x2208;<italic>S</italic> represents the patient&#x2019;s health on day <italic>t</italic> and <italic>a<sub>t</sub></italic> &#x2208; <italic>A</italic> is the treatment decision made based on <inline-formula><mml:math id="ieqn1"><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. <italic>P</italic> (<italic>s</italic><sub><italic>t</italic>+1</sub>|<italic>s</italic><sub><italic>t</italic></sub>, <italic>a</italic><sub><italic>t</italic></sub>) is the transition function, <italic>R</italic>(<italic>s<sub>t</sub></italic>) = <italic>r<sub>t</sub></italic> is the reward function, and <italic>&#x03B3;</italic> &#x2208; [0, 1] is the discount factor. The discrete state space <italic>S</italic> was defined by clustering the EHR features in a learned embedding space. The action space <italic>A</italic> = {0, 1} was defined to encode binary treatment decisions, corresponding to whether the patient received loop diuretics (Section A4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). All intermediate rewards were set to 0, and a terminal reward was given when the patient&#x2019;s hospitalization ended or reached 8 days (whichever is earlier). The terminal reward was 100 if the patient was discharged alive and &#x2013;100 if the patient died. Our objective was to learn a policy <italic>&#x03C0;</italic>: <italic>S</italic>&#x00D7;<italic>A</italic> &#x2192; [0, 1] which maps <italic>s</italic><sub><italic>t</italic></sub> to a probability distribution over <italic>a</italic><sub><italic>t</italic></sub>, in order to maximize the expected cumulative reward <inline-formula><mml:math id="ieqn2"><mml:msub><mml:mrow><mml:mi>J</mml:mi><mml:mo>(</mml:mo><mml:mi>&#x03C0;</mml:mi><mml:mo>)</mml:mo><mml:mo>=</mml:mo><mml:mi mathvariant="double-struck">E</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x03C0;</mml:mi><mml:mi> </mml:mi></mml:mrow></mml:msub><mml:mfenced open="[" close="]" separators="|"><mml:mrow><mml:mrow><mml:msubsup><mml:mo stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mn>9</mml:mn></mml:mrow></mml:msubsup><mml:mrow><mml:msup><mml:mrow><mml:mi>&#x03B3;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mi>R</mml:mi><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mrow></mml:mfenced></mml:math></inline-formula> where <italic>&#x03B3;</italic>=0.99. This roughly corresponds to an objective that focuses on minimizing the overall mortality rate.</p><p>PROP-RL consists of the following 5 steps (<xref ref-type="fig" rid="figure1">Figure 1B</xref>): (1) data partition, (2) defining the state space, (3) estimating the behavior policy, (4) training and selecting the RL policy, and (5) final policy evaluation.</p></sec><sec id="s2-3-2"><title>Step 1. Data Partition</title><p>We created 10 partitions of the development set by randomly assigning each hospitalization to either the training or validation split. These partitions were used for steps (2) and (4).</p></sec><sec id="s2-3-3"><title>Step 2. Defining the State Space</title><p>We used a data-driven approach to establish state definitions. For each data partition, a function mapping the 243-dimensional feature space to a discrete state space was learned by training a neural network embedding model and applying ensemble <italic>k</italic>-means clustering in the embedding space (Section A5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. <italic>k</italic>, the size of the discrete state space, was a hyperparameter that varied from {20,40,...,160} (Section A6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The state definition itself was treated as a hyperparameter.</p><p>Prior to policy learning, the state definitions were evaluated for generalizability and informativeness. We verified that each hospitalization transitioned across multiple different states and that the state distribution was not heavily skewed toward a few specific states. Failing to meet both criteria implies an overfitted state definition unlikely to generalize to new patients. Second, to ensure the embeddings captured important information, we conducted a principal component analysis of the cluster centers. We visualized the distribution of the cluster centers using the average and SD of the mortality rates, SOFA scores, and clinicians&#x2019; previous and next actions among the windows belonging to each state.</p></sec><sec id="s2-3-4"><title>Step 3. Estimating the Behavior Policy</title><p>We estimated the behavior policy by computing the average observed action for each state within the development set. This is a stochastic policy that maps each state to a probability over the binary actions. To further validate the state definitions, we performed 2 evaluations using the estimated behavior policy. First, we compared the estimated mortality rate of the behavior policy on the held-out test set to the true mortality rate. Significant differences in these values would either indicate a significant change in clinicians&#x2019; behavior between the 2 datasets, or the state definitions&#x2019; inability to encode the behavior policy. Second, we investigated whether the estimated behavior policy aligns with clinical understanding by visualizing trends in the behavior policy with respect to key features of the states (Section A7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s2-3-5"><title>Step 4. Training and Selecting the RL Policy</title><p>After learning the transition matrix from the training data, we learned the optimal policy using a modified version of value iteration with 2 offline RL constraints: batch-constrained Q-learning (BCQ) and pessimistic Markov decision process (pMDP) [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. These constraints mitigate extrapolation error, which refers to inaccurate value estimations for state-action pairs that were rarely or never observed during training [<xref ref-type="bibr" rid="ref23">23</xref>]. In brief, BCQ constrains the policy to avoid actions unlikely to be selected by the behavior policy, and pMDP encourages the policy to avoid areas in the state-action space with high uncertainty (Section A8 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Both BCQ and pMDP have additional hyperparameters.</p><p>Recent work found hyperparameter selection in offline RL to be sensitive to the partitioning of the dataset [<xref ref-type="bibr" rid="ref7">7</xref>]. To mitigate this, we use the Split-Select-Retrain (SSR) pipeline that selects the optimal hyperparameters by aggregating validation performance over multiple partitions of the development set [<xref ref-type="bibr" rid="ref7">7</xref>]. The final policy is then learned from the entire development set using the selected optimal hyperparameters. We leveraged the same 10 partitions (train and validation split) described in step (1) (Section A9 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><p>Performance was measured using the OPE method weighted importance sampling (WIS), known for its simplicity and reliance on relatively few assumptions [<xref ref-type="bibr" rid="ref25">25</xref>]. WIS is a biased but consistent estimator, with estimates converging to the true value as sample size increases. WIS estimates both the performance and effective sample size (ESS) of the policy, which is a measure of confidence in the performance estimate [<xref ref-type="bibr" rid="ref26">26</xref>]. ESS values closer to the size of the dataset used for evaluation (ie, validation set) indicate higher confidence in the performance estimate. For the main analysis, we focus on WIS, but for robustness, we also consider 3 additional OPE methods: fitted Q evaluation, approximate model, and weighted doubly robust estimates (Section A10 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for methodological details). <italic>P</italic> values are estimated by a one-sided bootstrap resampling test [<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>In order to minimize disruption to clinical workflow without sacrificing policy performance, we modified the learned policy prior to evaluation by identifying &#x201C;unimportant states,&#x201D; inspired by Shen et al [<xref ref-type="bibr" rid="ref28">28</xref>]. In unimportant states, no action can significantly impact the outcome. Our policies deferred to clinicians&#x2019; decisions in unimportant states, thus minimizing the amount of potential deviation from clinician behavior. The threshold used to determine unimportance was considered a hyperparameter (Section A11 and A12 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s2-3-6"><title>Step 5. Final Policy Evaluation</title><p>The final policy was evaluated on the held-out test set using WIS. Improvement in performance compared to the behavior policy was measured across 1000 bootstrapped samples in terms of expected cumulative reward and mortality. The level of disagreement between the average clinician and the final learned policy was compared to the level of disagreement among clinicians (Section A13 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><p>To understand how the learned policy differs from the behavior policy, we focused on &#x201C;divergent&#x201D; states where the action recommended by the learned policy diverged from the majority action of the behavior policy. The learned policy was then evaluated on a subset of the cohort where the patient&#x2019;s hospitalization included divergent states. We further characterized these states by comparing the average values of their key features to those of the general population (Section A14 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec></sec><sec id="s2-4"><title>Ablation Studies of Pipeline</title><p>Our pipeline included 3 key elements designed to improve the robustness of the learned policy: (1) using unimportant states to &#x201C;relax&#x201D; the learned policy, (2) evaluation across multiple data partitions (SSR), and (3) treating state definitions as a hyperparameter. To demonstrate the effect of each element on the robustness of the learned policy, we conducted an ablation study by selectively removing each component from the pipeline. As a proxy for measuring robustness, we looked at the worst-case OPE performance of the learned policies to establish an empirical lower bound.</p></sec><sec id="s2-5"><title>Ethical Considerations</title><p>This study was approved by the Institutional Review Board at the University of Michigan Medical School (HUM00141899) with a waiver of informed consent among study patients. All data collected were deidentified and were accessed via a secure cloud storage platform and a secured, Health Insurance Portability and Accountability Act (HIPAA)&#x2013;compliant server. Participants were not compensated for the use of their data in this study. The study followed the TRIPOD+AI (Transparent Reporting of a Multivariable Prediction Model for Individual Prognosis or Diagnosis+Artificial Intelligence) reporting guideline [<xref ref-type="bibr" rid="ref29">29</xref>] (<xref ref-type="supplementary-material" rid="app2">Checklist 1</xref>). As this study was retrospective in nature, no formal study protocol was developed and the study was not registered. No patients or the public were involved in any aspect of this study.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Study Cohort and Patient Characteristics</title><p>The initial cohort consisted of 57,907 hospitalizations. We removed cases where supplemental oxygen was not given within 24 hours (n=14,902), patients were moved to surgery within 24 hours (n=6283), and hospitalizations lasting shorter than 2 windows (n=152). The final study population contained 23,945 unique patients and 36,570 unique hospitalizations divided temporally by admission year into the development (n=29,765; 2015&#x2010;2018) and test set (n=6805; 2019) (<xref ref-type="table" rid="table1">Table 1</xref>; Section A1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The mortality rate of the entire cohort was 5.4% (1978/36,570), and 5.2% (1555/29,765) and 6.2% (423/6805) for the development and test set, respectively.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Cohort characteristics. Values are numbers (percentages) unless stated otherwise.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Cohort</td><td align="left" valign="bottom">Overall (2015&#x2010;2019)</td><td align="left" valign="bottom">Development set (2015&#x2010;2018)</td><td align="left" valign="bottom">Test set (2019)</td></tr></thead><tbody><tr><td align="left" valign="top">Hospitalizations, n</td><td align="left" valign="top">36,570</td><td align="left" valign="top">29,765</td><td align="left" valign="top">6805</td></tr><tr><td align="left" valign="top">Age (years), median (IQR)</td><td align="left" valign="top">64 (53&#x2010;74)</td><td align="left" valign="top">64 (52&#x2010;74)</td><td align="left" valign="top">65 (54&#x2010;75)</td></tr><tr><td align="left" valign="top" colspan="4">Age range (years), n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>18&#x2010;25</td><td align="left" valign="top">1142 (3.1)</td><td align="left" valign="top">1010 (3.4)</td><td align="left" valign="top">124 (1.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>26&#x2010;45</td><td align="left" valign="top">4747 (13.0)</td><td align="left" valign="top">3889 (13.1)</td><td align="left" valign="top">858 (12.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>46&#x2010;65</td><td align="left" valign="top">13,770 (37.7)</td><td align="left" valign="top">11,266 (37.8)</td><td align="left" valign="top">2504 (36.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>66&#x2010;85</td><td align="left" valign="top">14,317 (39.1)</td><td align="left" valign="top">11,477 (38.6)</td><td align="left" valign="top">2840 (41.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x003E;85</td><td align="left" valign="top">2594 (7.1)</td><td align="left" valign="top">2123 (7.1)</td><td align="left" valign="top">471 (6.9)</td></tr><tr><td align="left" valign="top" colspan="4">Sex, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Female</td><td align="left" valign="top">17,364 (47.5)</td><td align="left" valign="top">14,241 (47.8)</td><td align="left" valign="top">3123 (45.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Male</td><td align="left" valign="top">19,206 (52.5)</td><td align="left" valign="top">15,524 (52.2)</td><td align="left" valign="top">3682 (54.1)</td></tr><tr><td align="left" valign="top" colspan="4">Self-reported race, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>White or Caucasian</td><td align="left" valign="top">30,529 (83.5)</td><td align="left" valign="top">24,853 (83.5)</td><td align="left" valign="top">5676 (83.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Black or African American</td><td align="left" valign="top">4295 (11.7)</td><td align="left" valign="top">3503 (11.8)</td><td align="left" valign="top">792 (11.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Asian</td><td align="left" valign="top">642 (1.8)</td><td align="left" valign="top">516 (1.7)</td><td align="left" valign="top">126 (1.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>American Indian or Alaska Native</td><td align="left" valign="top">141 (0.4)</td><td align="left" valign="top">115 (0.4)</td><td align="left" valign="top">26 (0.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Native Hawaiian or Other Pacific Islander</td><td align="left" valign="top">27 (0.1)</td><td align="left" valign="top">21 (0.1)</td><td align="left" valign="top">6 (0.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other</td><td align="left" valign="top">659 (1.8)</td><td align="left" valign="top">546 (1.8)</td><td align="left" valign="top">113 (1.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Unknown</td><td align="left" valign="top">209 (0.6)</td><td align="left" valign="top">160 (0.5)</td><td align="left" valign="top">49 (0.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Patient refused</td><td align="left" valign="top">68 (0.2)</td><td align="left" valign="top">51 (0.2)</td><td align="left" valign="top">17 (0.2)</td></tr><tr><td align="left" valign="top" colspan="4">Hospitalization outcome, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Alive</td><td align="left" valign="top">34,592 (94.6)</td><td align="left" valign="top">28,210 (94.8)</td><td align="left" valign="top">6382 (93.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Death</td><td align="left" valign="top">1978 (5.4)</td><td align="left" valign="top">1555 (5.2)</td><td align="left" valign="top">423 (6.2)</td></tr><tr><td align="left" valign="top">Length of stay (days), median (IQR)</td><td align="left" valign="top">6 (4-9)</td><td align="left" valign="top">6 (4-9)</td><td align="left" valign="top">6 (4-10)</td></tr><tr><td align="left" valign="top" colspan="4">Length of stay (days), n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1&#x2010;3</td><td align="left" valign="top">8216 (22.5)</td><td align="left" valign="top">6939 (23.3)</td><td align="left" valign="top">1277 (18.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>4&#x2010;5</td><td align="left" valign="top">9446 (25.8)</td><td align="left" valign="top">7736 (26.0)</td><td align="left" valign="top">1710 (25.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>6&#x2010;9</td><td align="left" valign="top">10,027 (27.4)</td><td align="left" valign="top">8063 (27.1)</td><td align="left" valign="top">1964 (28.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>10&#x2010;15</td><td align="left" valign="top">5136 (14.0)</td><td align="left" valign="top">4046 (13.6)</td><td align="left" valign="top">1090 (16.0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x003E;15</td><td align="left" valign="top">3745 (10.2)</td><td align="left" valign="top">2981 (10.0)</td><td align="left" valign="top">764 (11.2)</td></tr></tbody></table></table-wrap></sec><sec id="s3-2"><title>Evaluation of State Definitions</title><p>The final state definition <italic>S<sub>j,k</sub></italic> (<italic>j</italic>=7, <italic>k</italic>=60) was selected from the hyperparameter search. Overall, 98.3% (29,270/29,765) of hospitalizations in the development set and 98.9% (6730/6805) in the test set contained at least 2 distinct states, indicating at least one transition between different states within these hospitalizations (<xref ref-type="fig" rid="figure2">Figure 2A</xref>). All states appeared relatively uniformly in the data with each state constituting 1.7% (SD 0.5%) of all windows on average in both the development set (3769, SD 1187; 226,178 total windows) and test set (893, SD 268; 53,591 total windows) (<xref ref-type="fig" rid="figure2">Figure 2B</xref>; Section B2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for test set results).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Sanity checks for the state definitions on the development set. The first row shows histograms depicting (A) number of states in each trajectory and (B) the number of windows in each state. The second and third rows show principal component analysis of the representations of the cluster centers that define each state. The hue and size of each dot represent the average and SD of the feature value of all samples in that state. The features are (C) mortality rate, (D) SOFA score, (E) whether loop diuretics were administered in the past 24 hours, and (F) whether clinicians chose to administer loop diuretics. SOFA: Sequential Organ Failure Assessment.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e69145_fig02.png"/></fig><p>Plots of the cluster centers show that information regarding mortality and clinicians&#x2019; actions is encoded in the states. <xref ref-type="fig" rid="figure2">Figure 2C and 2D</xref> indicate that a visible gradient exists in the state representation space with respect to both average mortality and average SOFA score. <xref ref-type="fig" rid="figure2">Figure 2E and 2F</xref> show a distinct separation in the state representation space in terms of both the clinician&#x2019;s previous and next actions. Note that even without being explicitly trained for it, the state representation space captures information about the previous action.</p></sec><sec id="s3-3"><title>Evaluation of the Estimated Behavior Policy</title><p>The estimated mortality rate of the clinician behavior policy across 1000 bootstrapped samples was 6.2% (95% CI 5.6&#x2010;6.8) (Section B9 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). This was comparable to the true mortality rate of 6.2% (95% CI 5.6%&#x2010;6.8%) observed in the held-out test set, suggesting the state definitions have accurately captured clinicians&#x2019; behavior. Qualitatively, we found the estimated behavior policy to recommend loop diuretics if the patient is older, given loop diuretics the previous day, has higher brain natriuretic peptide (BNP) values, and has higher blood urea nitrogen values (<xref ref-type="fig" rid="figure3">Figure 3</xref>). We report trends for additional features in Section B3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>The relationship between the clinician&#x2019;s likelihood of administering loop diuretics and key features. Features shown are (from left to right, top to bottom): mortality, SOFA score, age, whether loop diuretics were administered in the past 24 hours, BNP value, and blood urea nitrogen value. The height of the bars represents the average value of each feature within the state, and the color represents the clinician&#x2019;s likelihood of administering loop diuretics. BNP: brain natriuretic peptide; SOFA: Sequential Organ Failure Assessment.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e69145_fig03.png"/></fig></sec><sec id="s3-4"><title>Evaluation of the Final Learned Policy</title><p>Of the 60 states, 36 were unimportant and the learned policy deferred to clinicians (<xref ref-type="fig" rid="figure4">Figure 4A</xref>). For the remaining 24 states, the learned policy tended to recommend the majority action: among 21,759 windows belonging to these states in the test set, only 3858 (17.7%) windows were assigned a different action under the learned policy. Yet in 2 divergent states (states 10 and 44), the learned policy did not follow the majority action. While the learned policy always recommended loop diuretics to be administered for both states, clinicians only took this action 34% (454/1326) and 35% (568/1614) of the time, respectively (<xref ref-type="fig" rid="figure4">Figure 4B</xref>).</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>(A) Comparison of the actions recommended by the clinician behavior policy and the learned policy for each state. The color of the boxes indicates the probability of giving loop diuretics. States are ordered by decreasing likelihood of clinicians prescribing loop diuretics. Hatched boxes indicate &#x201C;unimportant&#x201D; states where the learned policy recommends the same actions as the behavior policy. (B) Likelihood of agreement between the clinician behavior policy and the learned policy for each state. On the left graph, states are ordered by decreasing likelihood of agreement with the clinicians. The right bar graph focuses on the 10 states where the clinicians disagree the most with the learned policy. States 10 and 44, where the likelihood of agreement is less than 0.5 (learned policy does not follow majority action), are defined as &#x201C;divergent&#x201D; states.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e69145_fig04.png"/></fig><p>On the entire held-out test set, the learned policy outperformed the behavior policy 967 times across 1000 bootstraps (96.7%) and was estimated to reduce mortality from 6.2% to 5.7%, by 0.5 (95% CI 0.0&#x2212;1.1; <italic>P</italic>=.03) percentage points on average (<xref ref-type="table" rid="table2">Table 2</xref>). The ESS of the learned policy was 3168.45 (95% CI 3090.91-3256.65), nearly half the size of the dataset (n=6805) indicating a high confidence in the WIS estimate (Section B7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for validation set results). On the subset of hospitalizations with divergent states, the learned policy outperformed the behavior policy 994 times across 1000 bootstraps (99.4%) and significantly decreased the estimated overall mortality from 3.8% to 2.2% by 1.6 (95% CI 0.4&#x2010;2.8; <italic>P</italic>=.006) percentage points on average (<xref ref-type="table" rid="table2">Table 2</xref>). The ESS of the learned policy was 550.39 (95% CI 511.49&#x2010;588.70), approximately 25% of the sample size (n=2152) and indicated a high confidence in the performance estimate. Similar improvements were observed with other OPE methods (Section B10 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Quantitative evaluation of behavior and learned policy on the held-out test set and a subset of the test set where the patient trajectories included the 2 divergent states. Values in parentheses indicate the 95% CI across 1000 bootstraps.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset</td><td align="left" valign="bottom" colspan="2">Held-out test set (n=6805)</td><td align="left" valign="bottom" colspan="2">Subset with divergent states (n=2152)</td></tr><tr><td align="left" valign="top">Policy</td><td align="left" valign="top">Behavior policy</td><td align="left" valign="top">Learned policy</td><td align="left" valign="top">Behavior policy</td><td align="left" valign="top">Learned policy</td></tr></thead><tbody><tr><td align="left" valign="top">Estimated <italic>J</italic> (<italic>&#x03C0;</italic>) (95% CI)</td><td align="left" valign="top">87.56 (86.42 to 88.74)</td><td align="left" valign="top">88.59<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> (87.10 to 90.01)</td><td align="left" valign="top">92.40 (90.89 to 93.96)</td><td align="left" valign="top">95.57<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> (93.10 to 97.89)</td></tr><tr><td align="left" valign="top">Estimated improvement in <italic>J</italic> (<italic>&#x03C0;</italic>) (95% CI)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">1.03 (&#x2212;0.05 to 2.10)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">3.17 (0.77 to 5.46)</td></tr><tr><td align="left" valign="top">Estimated mortality (%) (95% CI)</td><td align="left" valign="top">6.22 (5.63 to 6.79)</td><td align="left" valign="top">5.70 (4.99 to 6.45)</td><td align="left" valign="top">3.80 (3.02 to 4.56)</td><td align="left" valign="top">2.22 (1.06 to 3.45)</td></tr><tr><td align="left" valign="top">Estimated decrease in mortality (%) (95% CI)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.52 (&#x2212;0.03 to 1.05)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.58 (0.38 to 2.75)</td></tr><tr><td align="left" valign="top">Effective sample size (95% CI)</td><td align="left" valign="top">6805</td><td align="left" valign="top">3168.46 (3090.91 to 3256.65)</td><td align="left" valign="top">2152</td><td align="left" valign="top">550.39 (511.49 to 588.70)</td></tr><tr><td align="left" valign="top">% of time outperformed behavior policy</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">96.70</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">99.40</td></tr><tr><td align="left" valign="top">Disagreement with clinician (%) (95% CI)</td><td align="left" valign="top">22.91 (22.61 to 23.18)</td><td align="left" valign="top">21.19 (20.86 to 21.49)</td><td align="left" valign="top">30.80 (30.63 to 30.96)</td><td align="left" valign="top">32.38 (32.19 to 32.57)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup><italic>P</italic>=.03.</p></fn><fn id="table2fn2"><p><sup>b</sup><italic>P</italic>=.006.</p></fn><fn id="table2fn3"><p><sup>c</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><p>State visualization (Section B4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) found that states 10 and 44 are close in the embedding space. Feature importance analysis of classifiers for each state showed a large overlap in key features (Section B5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), including age, previous loop diuretic, BNP value, and blood urea nitrogen value. Both states consisted of slightly older patients with an average age of 69.8 (SD 13.9) (state 10) and 68.7 (SD 13.5) (state 44) compared to the population mean of 63.4 (SD 16.2). Patients in both groups had higher BNP values (539.7, SD 960.8 vs 405.5, SD 781.8 vs 397.1, SD 687.0 for state 10, state 44, and the population, respectively) and mild kidney impairment as characterized by higher blood urea nitrogen values (35.6, SD 25.2 vs 32.9, SD 21.6 vs 28.2, SD 21.3 for state 10, state 44, and the population, respectively; Section B6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s3-5"><title>Ablation Study of Pipeline</title><p>In all cases, the worst-case performance of the learned policy when one or more components were removed from the pipeline was significantly lower than the worst-case performance of the policy derived from the full pipeline (<xref ref-type="table" rid="table3">Table 3</xref>). We focus on the 2 novel aspects of the pipeline here: relaxing the unimportant states and tuning the state definitions.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Worst-case performance of the learned policy when one or more of the 3 key elements in the pipeline were removed. The 3 elements are: (1) use of unimportant state relaxation (no vs yes), (2) number of data splits (single vs multiple), and (3) number of state definitions (single vs multiple). Values in parentheses indicate the 95% CI across 1000 bootstraps.<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Unimportant state relaxation</td><td align="left" valign="bottom">Number of data splits</td><td align="left" valign="bottom">Number of state definitions</td><td align="left" valign="bottom">Estimated improvement in <italic>J</italic> (<italic>&#x03C0;</italic>) (95% CI) (&#x2191;)</td><td align="left" valign="bottom">Estimated mortality % (95% CI) (&#x2193;)</td><td align="left" valign="bottom">% Time outperformed behavior policy (&#x2191;)</td></tr></thead><tbody><tr><td align="left" valign="top">No</td><td align="left" valign="top">Single</td><td align="left" valign="top">Single</td><td align="left" valign="top">No viable policy</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">No</td><td align="left" valign="top">Single</td><td align="left" valign="top">Multiple</td><td align="left" valign="top">No viable policy</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">No</td><td align="left" valign="top">Multiple</td><td align="left" valign="top">Single</td><td align="left" valign="top">No viable policy</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">No</td><td align="left" valign="top">Multiple</td><td align="left" valign="top">Multiple</td><td align="left" valign="top">No viable policy</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Yes</td><td align="left" valign="top">Single</td><td align="left" valign="top">Single</td><td align="left" valign="top">&#x2212;2.48 (&#x2212;8.80 to 2.09)</td><td align="left" valign="top">7.46 (4.93 to 10.76)</td><td align="left" valign="top">20.20</td></tr><tr><td align="left" valign="top">Yes</td><td align="left" valign="top">Single</td><td align="left" valign="top">Multiple</td><td align="left" valign="top">&#x2212;0.04 (&#x2212;0.63 to 0.55)</td><td align="left" valign="top">6.24 (5.59 to 6.90)</td><td align="left" valign="top">44.90</td></tr><tr><td align="left" valign="top">Yes</td><td align="left" valign="top">Multiple</td><td align="left" valign="top">Single</td><td align="left" valign="top">0.45 (&#x2212;1.48 to 2.22)</td><td align="left" valign="top">6.00 (4.93 to 7.18)</td><td align="left" valign="top">70.70</td></tr><tr><td align="left" valign="top">Yes</td><td align="left" valign="top">Multiple</td><td align="left" valign="top">Multiple</td><td align="left" valign="top">1.03 (&#x2212;0.05 to 2.10)</td><td align="left" valign="top">5.70 (4.99 to 6.45)</td><td align="left" valign="top">96.70</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>The estimated improvement in <italic>J</italic> (<italic>&#x03C0;</italic>) (&#x2191;) and estimated mortality % (&#x2193;) for the behavior policy is 0.00 (95% CI &#x2212;1.14 to 1.18) and 6.22 (95% CI 5.63 to 6.79), respectively.</p></fn><fn id="table3fn2"><p><sup>b</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><p>Removing the unimportant state relaxation led to a catastrophic failure, as no policy obtained an ESS of at least 10% the validation dataset size. This indicates overfitting, and we were unable to get a reliable estimate of the policies&#x2019; performance on the test set. Using a fixed state definition instead of tuning the state definitions led to significant variation in the performance of the learned policy depending on the data split used to learn the fixed state definition (Section B8 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). In the worst-case scenario, the improvement in value of the learned policy compared to the behavior policy was &#x2212;0.04 (95% CI &#x2212;0.63 to 0.55), which was significantly lower than the improvement in value of 1.03 (95% CI &#x2212;0.05 to 2.10; <italic>P</italic>=.01) of the policy derived from the full pipeline.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Offline RL has been applied to various health care domains [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref4">4</xref>]. However, a clear guide that practitioners can refer to has not been established. We present a blueprint based on previous literature to streamline the development of offline RL policies and further facilitate this through a public code base. We demonstrated the utility of our rigorous pipeline in the context of learning treatment decision policies for loop diuretics in hospitalized patients. Overall, in retrospective analysis, the learned policy was estimated to lead to significant improvement in outcome for the general patient population, especially for a subset of patients where the learned policy differed the most from clinician behavior. Though it will require prospective validation, our results reveal areas of potential improvement in current clinical care.</p><p>A key challenge in offline RL is ensuring the robustness of the learned policy. Two elements in our pipeline contributed to the improvement in robustness and performance. The first element&#x2014;tuning state definitions&#x2014;addresses the issue of hyperparameter sensitivity in offline RL. To select the optimal hyperparameters, prior work often relied on the hold-out method which partitioned the development dataset into training and validation sets [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. Recently, Nie et al [<xref ref-type="bibr" rid="ref7">7</xref>] found policy performance to be sensitive to this partitioning itself and proposed the SSR pipeline which uses multiple dataset partitions during evaluation. Building upon this insight, we show that while common practice has been to use a fixed state definition derived from a single train-validation split [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>], the partitioning used to learn the state definitions can also result in significant variability of the final policy&#x2019;s performance, and thus jointly tuning state definitions and policy learning over multiple data partitions is important for robustness of the learned policy.</p><p>The second element&#x2014;relaxing the learned policy via unimportant states&#x2014;is a form of policy constraint that mitigates the impact of extrapolation error by reducing the deviation of the learned policy from the behavior policy [<xref ref-type="bibr" rid="ref5">5</xref>]. Using unimportant states to constrain the policy post hoc also helps reduce disruptions to the current workflow, an important consideration in health care settings. During deployment, the policy acts as an alert system to notify providers of the appropriate treatment [<xref ref-type="bibr" rid="ref32">32</xref>]. Yet a well-known consequence in alert systems is &#x201C;alert fatigue,&#x201D; where providers ignore alerts due to the high frequency of irrelevant or unhelpful alerts [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. By generating recommendations only when the action will meaningfully impact the outcome, unimportant state relaxation presents a simple solution to reduce disruptions to existing workflows while minimally compromising the policy&#x2019;s performance.</p><p>In analyzing our learned policy, we found that loop diuretics had a limited effect on patient outcome for a sizable portion of the cohort. Our pipeline could thus be used to identify patient groups that are likely responsive to treatments. In these treatment-responsive cases, the learned policy tended to agree with the majority of clinicians, indicating that our policy could help reduce heterogeneity in treatment decisions. Patients in the 2 divergent states were slightly older and had mild kidney impairment, which could explain clinicians&#x2019; hesitancy in prescribing loop diuretics. However, the high BNP values indicate that the patients are fluid overloaded and may still benefit from diuretic treatment.</p><p>Our study is not without limitations. The pipeline used a single OPE method (WIS) during hyperparameter selection and a single dataset. While designed to be agnostic to both, future studies using external datasets and different OPE methods during hyperparameter tuning will further validate the generalizability of PROP-RL. Our problem formulation enforced decisions to be binary and to occur every 24 hours at fixed time points (Section B1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for results across different decision points). A finer-grained problem formulation&#x2014;such as specifying the exact dosage, incorporating additional actions (ie, other medications), and using shorter or more flexible time intervals for actions&#x2014;along with additional data will be required to learn a policy that can be deployed in clinical settings. A promising direction for future work is incorporating clinician feedback after deployment to further refine the alert threshold and better understand when recommendations are most useful to clinicians, beyond our current approach of using unimportant states (Section A15 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><p>Another important limitation is our reliance on retrospective evaluation. In the absence of a reliable simulator and safety concerns associated with real-world evaluation, we relied on OPE methods which may not reflect the policy&#x2019;s true performance during deployment. We mitigate this by imposing a large cutoff on the ESS during hyperparameter selection and by confirming our findings across multiple OPE methods. Nonetheless, retrospective evaluation should only be viewed as a preliminary step for identifying promising policies prior to investing in prospective studies. Future work must include robust prospective validation in accordance with guidelines such as the DECIDE-AI reporting framework [<xref ref-type="bibr" rid="ref34">34</xref>]. The potential for unmeasured confounding is also a fundamental limitation of OPE methods. To mitigate this, we derived our state space using a comprehensive set of EHR features, selected in close consultation with a clinical collaborator with deep domain expertise. However, residual confounding may remain. Since these challenges are present in any realistic problem setting, our approach serves as a guide for other researchers to follow when learning offline RL policies.</p></sec><sec id="s4-2"><title>Conclusion</title><p>In summary, we present a standardized pipeline to streamline the development of offline RL policies in health care settings. We demonstrate the utility of this pipeline in the context of learning treatment decision policies for loop diuretics in hospitalized patients and show that the learned RL policy could potentially lead to a significant improvement in a key subset of the patient population. Our work highlights important considerations for applying RL to observational data to learn treatment decision policies, and our open-sourced code base can facilitate future development of offline RL policies on other clinical problems.</p></sec></sec></body><back><ack><p>This work was supported by the National Heart, Lung, and Blood Institute of the National Institutes of Health (NIH) under grant R01HL158626 to JW and MS, and the National Library of Medicine of the NIH under grant R01LM013325 to JW and MS. The funders had no role in study design, data collection, data analysis, data interpretation, writing of the report, and the decision to submit. ST was affiliated with the Division of Computer Science and Engineering at the University of Michigan at the time of this research and is currently affiliated with the Department of Computer Science at Emory University.</p></ack><notes><sec><title>Data Availability</title><p>The source code used in this study is available from the Pipeline for Learning Robust Policies in Reinforcement Learning (PROP-RL) repository [<xref ref-type="bibr" rid="ref19">19</xref>]. The datasets generated or analyzed during this study are not publicly available due to patient privacy and ethical restrictions, but deidentified data could be made available from the corresponding author on reasonable request. A sample of synthetic data is provided with the source code.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BCQ</term><def><p>batch-constrained Q-learning</p></def></def-item><def-item><term id="abb2">BNP</term><def><p>brain natriuretic peptide</p></def></def-item><def-item><term id="abb3">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb4">ESS</term><def><p>effective sample size</p></def></def-item><def-item><term id="abb5">FIDDLE</term><def><p>Flexible Data-Driven Pipeline</p></def></def-item><def-item><term id="abb6">HIPAA</term><def><p>Health Insurance Portability and Accountability Act</p></def></def-item><def-item><term id="abb7">MDP</term><def><p>Markov decision process</p></def></def-item><def-item><term id="abb8">OPE</term><def><p>off-policy evaluation</p></def></def-item><def-item><term id="abb9">PCA</term><def><p>principal component analysis</p></def></def-item><def-item><term id="abb10">pMDP</term><def><p>pessimistic Markov decision process</p></def></def-item><def-item><term id="abb11">PROP-RL</term><def><p>Pipeline for Learning Robust Policies in Reinforcement Learning</p></def></def-item><def-item><term id="abb12">RL</term><def><p>reinforcement learning</p></def></def-item><def-item><term id="abb13">SOFA</term><def><p> Sequential Organ Failure Assessment</p></def></def-item><def-item><term id="abb14">SSR</term><def><p>Split-Select-Retrain</p></def></def-item><def-item><term id="abb15">TRIPOD+AI</term><def><p>Transparent Reporting of a Multivariable Prediction Model for Individual Prognosis or Diagnosis+Artificial Intelligence</p></def></def-item><def-item><term id="abb16">WIS</term><def><p>weighted importance sampling</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Sutton</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Barto</surname><given-names>AG</given-names> </name></person-group><source>Reinforcement Learning, Second Edition: An Introduction</source><year>2018</year><publisher-name>MIT Press</publisher-name><pub-id pub-id-type="other">978-0-262-03924-6</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Komorowski</surname><given-names>M</given-names> </name><name name-style="western"><surname>Celi</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Badawi</surname><given-names>O</given-names> </name><name name-style="western"><surname>Gordon</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Faisal</surname><given-names>AA</given-names> </name></person-group><article-title>The artificial intelligence clinician learns optimal treatment strategies for sepsis in intensive care</article-title><source>Nat Med</source><year>2018</year><month>11</month><volume>24</volume><issue>11</issue><fpage>1716</fpage><lpage>1720</lpage><pub-id pub-id-type="doi">10.1038/s41591-018-0213-5</pub-id><pub-id pub-id-type="medline">30349085</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Li</surname><given-names>K</given-names> </name><name name-style="western"><surname>Georgiou</surname><given-names>P</given-names> </name></person-group><article-title>Offline deep reinforcement learning and off-policy evaluation for personalized basal insulin control in type 1 diabetes</article-title><source>IEEE J Biomed Health Inform</source><year>2023</year><month>10</month><volume>27</volume><issue>10</issue><fpage>5087</fpage><lpage>5098</lpage><pub-id pub-id-type="doi">10.1109/JBHI.2023.3303367</pub-id><pub-id pub-id-type="medline">37607154</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Du</surname><given-names>J</given-names> </name><etal/></person-group><article-title>An interpretable RL framework for pre-deployment modeling in ICU hypotension management</article-title><source>NPJ Digit Med</source><year>2022</year><month>11</month><day>18</day><volume>5</volume><issue>1</issue><fpage>173</fpage><pub-id pub-id-type="doi">10.1038/s41746-022-00708-4</pub-id><pub-id pub-id-type="medline">36396808</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Levine</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>A</given-names> </name><name name-style="western"><surname>Tucker</surname><given-names>G</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>J</given-names> </name></person-group><article-title>Offline reinforcement learning: tutorial, review, and perspectives on open problems</article-title><source>arXiv</source><comment>Preprint posted online on  May 4, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2005.01643</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Paine</surname><given-names>TL</given-names> </name><name name-style="western"><surname>Paduraru</surname><given-names>C</given-names> </name><name name-style="western"><surname>Michi</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Hyperparameter selection for offline reinforcement learning</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 17, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2007.09055</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Nie</surname><given-names>A</given-names> </name><name name-style="western"><surname>Flet-Berliac</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Jordan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Steenbergen</surname><given-names>W</given-names> </name><name name-style="western"><surname>Brunskill</surname><given-names>E</given-names> </name></person-group><article-title>Data-efficient pipeline for offline reinforcement learning with limited data</article-title><access-date>2025-09-29</access-date><conf-name>Proceedings of the 36th Conference on Neural Information Processing Systems</conf-name><conf-date>Nov 28 to Dec 9, 2022</conf-date><conf-loc>New Orleans, LA, USA</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2022/file/5ee7ed60a7e8169012224dec5fe0d27f-Paper-Conference.pdf">https://proceedings.neurips.cc/paper_files/paper/2022/file/5ee7ed60a7e8169012224dec5fe0d27f-Paper-Conference.pdf</ext-link></comment></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kumar</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>A</given-names> </name><name name-style="western"><surname>Tucker</surname><given-names>G</given-names> </name><name name-style="western"><surname>Levine</surname><given-names>S</given-names> </name></person-group><article-title>Conservative q-learning for offline reinforcement learning</article-title><access-date>2025-09-21</access-date><conf-name>Proceedings of the 34th Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 6-12, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2020/file/0d2b2061826a5df3221116a5085a6052-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2020/file/0d2b2061826a5df3221116a5085a6052-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gottesman</surname><given-names>O</given-names> </name><name name-style="western"><surname>Johansson</surname><given-names>F</given-names> </name><name name-style="western"><surname>Komorowski</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Guidelines for reinforcement learning in healthcare</article-title><source>Nat Med</source><year>2019</year><month>01</month><volume>25</volume><issue>1</issue><fpage>16</fpage><lpage>18</lpage><pub-id pub-id-type="doi">10.1038/s41591-018-0310-5</pub-id><pub-id pub-id-type="medline">30617332</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McMurray</surname><given-names>JJV</given-names> </name><name name-style="western"><surname>Adamopoulos</surname><given-names>S</given-names> </name><name name-style="western"><surname>Anker</surname><given-names>SD</given-names> </name><etal/></person-group><article-title>ESC guidelines for the diagnosis and treatment of acute and chronic heart failure 2012: The Task Force for the Diagnosis and Treatment of Acute and Chronic Heart Failure 2012 of the European Society of Cardiology. Developed in collaboration with the Heart Failure Association (HFA) of the ESC</article-title><source>Eur J Heart Fail</source><year>2012</year><month>08</month><volume>14</volume><issue>8</issue><fpage>803</fpage><lpage>869</lpage><pub-id pub-id-type="doi">10.1093/eurjhf/hfs105</pub-id><pub-id pub-id-type="medline">22828712</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Killian</surname><given-names>TW</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Subramanian</surname><given-names>J</given-names> </name><name name-style="western"><surname>Fatemi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ghassemi</surname><given-names>M</given-names> </name></person-group><article-title>An empirical study of representation learning for reinforcement learning in healthcare</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 23, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2011.11235</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Tang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wiens</surname><given-names>J</given-names> </name></person-group><article-title>Model selection for offline reinforcement learning: practical considerations for healthcare settings</article-title><access-date>2024-05-28</access-date><conf-name>Proceedings of the 6th Machine Learning for Healthcare Conference, PMLR</conf-name><conf-date>Aug 6, 2021</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v149/tang21a.html">https://proceedings.mlr.press/v149/tang21a.html</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oh</surname><given-names>SW</given-names> </name><name name-style="western"><surname>Han</surname><given-names>SY</given-names> </name></person-group><article-title>Loop diuretics in clinical practice</article-title><source>Electrolyte Blood Press</source><year>2015</year><month>06</month><volume>13</volume><issue>1</issue><fpage>17</fpage><lpage>21</lpage><pub-id pub-id-type="doi">10.5049/EBP.2015.13.1.17</pub-id><pub-id pub-id-type="medline">26240596</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Clark</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Cleland</surname><given-names>JGF</given-names> </name></person-group><article-title>Causes and treatment of oedema in patients with heart failure</article-title><source>Nat Rev Cardiol</source><year>2013</year><month>03</month><volume>10</volume><issue>3</issue><fpage>156</fpage><lpage>170</lpage><pub-id pub-id-type="doi">10.1038/nrcardio.2012.191</pub-id><pub-id pub-id-type="medline">23319101</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Berliner</surname><given-names>D</given-names> </name><name name-style="western"><surname>Schneider</surname><given-names>N</given-names> </name><name name-style="western"><surname>Welte</surname><given-names>T</given-names> </name><name name-style="western"><surname>Bauersachs</surname><given-names>J</given-names> </name></person-group><article-title>The differential diagnosis of dyspnea</article-title><source>Dtsch Arztebl Int</source><year>2016</year><month>12</month><day>9</day><volume>113</volume><issue>49</issue><fpage>834</fpage><lpage>845</lpage><pub-id pub-id-type="doi">10.3238/arztebl.2016.0834</pub-id><pub-id pub-id-type="medline">28098068</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shah</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Stevenson</surname><given-names>LW</given-names> </name></person-group><article-title>Searching for evidence: refractory questions in advanced heart failure</article-title><source>J Card Fail</source><year>2004</year><month>06</month><volume>10</volume><issue>3</issue><fpage>210</fpage><lpage>218</lpage><pub-id pub-id-type="doi">10.1016/j.cardfail.2003.10.006</pub-id><pub-id pub-id-type="medline">15190530</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Palazzuoli</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ruocco</surname><given-names>G</given-names> </name><name name-style="western"><surname>Ronco</surname><given-names>C</given-names> </name><name name-style="western"><surname>McCullough</surname><given-names>PA</given-names> </name></person-group><article-title>Loop diuretics in acute heart failure: beyond the decongestive relief for the kidney</article-title><source>Crit Care</source><year>2015</year><month>09</month><day>3</day><volume>19</volume><issue>1</issue><fpage>296</fpage><pub-id pub-id-type="doi">10.1186/s13054-015-1017-3</pub-id><pub-id pub-id-type="medline">26335137</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Matsue</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Damman</surname><given-names>K</given-names> </name><name name-style="western"><surname>Voors</surname><given-names>AA</given-names> </name><etal/></person-group><article-title>Time-to-furosemide treatment and mortality in patients hospitalized with acute heart failure</article-title><source>J Am Coll Cardiol</source><year>2017</year><month>06</month><day>27</day><volume>69</volume><issue>25</issue><fpage>3042</fpage><lpage>3051</lpage><pub-id pub-id-type="doi">10.1016/j.jacc.2017.04.042</pub-id><pub-id pub-id-type="medline">28641794</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><article-title>PROP-RL: pipeline for learning robust policies in RL</article-title><source>GitHub</source><access-date>2024-04-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/MLD3/PROP-RL">https://github.com/MLD3/PROP-RL</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Davarmanesh</surname><given-names>P</given-names> </name><name name-style="western"><surname>Song</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Koutra</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sjoding</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Wiens</surname><given-names>J</given-names> </name></person-group><article-title>Democratizing EHR analyses with FIDDLE: a flexible data-driven preprocessing pipeline for structured clinical data</article-title><source>J Am Med Inform Assoc</source><year>2020</year><month>12</month><day>9</day><volume>27</volume><issue>12</issue><fpage>1921</fpage><lpage>1934</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaa139</pub-id><pub-id pub-id-type="medline">33040151</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Shi</surname><given-names>C</given-names> </name><name name-style="western"><surname>Blei</surname><given-names>D</given-names> </name><name name-style="western"><surname>Veitch</surname><given-names>V</given-names> </name></person-group><article-title>Adapting neural networks for the estimation of treatment effects</article-title><access-date>2024-05-28</access-date><conf-name>Proceedings of the 33rd Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 8-14, 2019</conf-date><conf-loc>Vancouver, Canada</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2019/file/8fb5f8be2aa9d6c64a04e3ab9f63feee-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2019/file/8fb5f8be2aa9d6c64a04e3ab9f63feee-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Strehl</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ghosh</surname><given-names>J</given-names> </name></person-group><article-title>Cluster ensembles --- a knowledge reuse framework for combining multiple partitions</article-title><source>J Mach Learn Res</source><year>2003</year><month>03</month><day>1</day><volume>3</volume><fpage>583</fpage><lpage>617</lpage><pub-id pub-id-type="doi">10.1162/153244303321897735</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Fujimoto</surname><given-names>S</given-names> </name><name name-style="western"><surname>Meger</surname><given-names>D</given-names> </name><name name-style="western"><surname>Precup</surname><given-names>D</given-names> </name></person-group><article-title>Off-policy deep reinforcement learning without exploration</article-title><access-date>2025-09-21</access-date><conf-name>Proceedings of the 36th International Conference on Machine Learning, PMLR</conf-name><conf-date>Jun 9-15, 2019</conf-date><conf-loc>Long Beach, California, USA</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v97/fujimoto19a.html">https://proceedings.mlr.press/v97/fujimoto19a.html</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kidambi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Rajeswaran</surname><given-names>A</given-names> </name><name name-style="western"><surname>Netrapalli</surname><given-names>P</given-names> </name><name name-style="western"><surname>Joachims</surname><given-names>T</given-names> </name></person-group><article-title>MOReL: model-based offline reinforcement learning</article-title><access-date>2024-05-28</access-date><conf-name>Proceedings of the 34th Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 6-12, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2020/file/f7efa4f864ae9b88d43527f4b14f750f-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2020/file/f7efa4f864ae9b88d43527f4b14f750f-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Precup</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sutton</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>S</given-names> </name></person-group><article-title>Eligibility traces for off-policy policy evaluation</article-title><conf-name>ICML &#x2019;00: Proceedings of the Seventeenth International Conference on Machine Learning</conf-name><conf-date>Jun 29 to Jul 2, 2000</conf-date><conf-loc>Stanford University, Stanford, CA, USA</conf-loc></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Martino</surname><given-names>L</given-names> </name><name name-style="western"><surname>Elvira</surname><given-names>V</given-names> </name><name name-style="western"><surname>Louzada</surname><given-names>F</given-names> </name></person-group><article-title>Effective sample size for importance sampling based on discrepancy measures</article-title><source>Signal Processing</source><year>2017</year><month>02</month><volume>131</volume><fpage>386</fpage><lpage>401</lpage><pub-id pub-id-type="doi">10.1016/j.sigpro.2016.08.025</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Moore</surname><given-names>D</given-names> </name><name name-style="western"><surname>Notz</surname><given-names>W</given-names> </name><name name-style="western"><surname>Fligner</surname><given-names>M</given-names> </name></person-group><source>The Basic Practice of Statistics</source><year>2021</year><publisher-name>WH Freeman &#x0026; Co Ltd</publisher-name><pub-id pub-id-type="other">978-1-319-34463-4</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Shen</surname><given-names>SP</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Gottesman</surname><given-names>O</given-names> </name><name name-style="western"><surname>Doshi-Velez</surname><given-names>F</given-names> </name></person-group><article-title>State relevance for off-policy evaluation</article-title><access-date>2025-09-21</access-date><conf-name>Proceedings of the 38th International Conference on Machine Learning, PMLR</conf-name><conf-date>Jul 18-24, 2021</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v139/shen21d.html">https://proceedings.mlr.press/v139/shen21d.html</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Collins</surname><given-names>GS</given-names> </name><name name-style="western"><surname>Moons</surname><given-names>KGM</given-names> </name><name name-style="western"><surname>Dhiman</surname><given-names>P</given-names> </name><etal/></person-group><article-title>TRIPOD+AI statement: updated guidance for reporting clinical prediction models that use regression or machine learning methods</article-title><source>BMJ</source><year>2024</year><month>04</month><day>16</day><volume>385</volume><fpage>e078378</fpage><pub-id pub-id-type="doi">10.1136/bmj-2023-078378</pub-id><pub-id pub-id-type="medline">38626948</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kondrup</surname><given-names>F</given-names> </name><name name-style="western"><surname>Jiralerspong</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lau</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Towards safe mechanical ventilation treatment using deep offline reinforcement learning</article-title><source>AAAI</source><year>2023</year><volume>37</volume><issue>13</issue><fpage>15696</fpage><lpage>15702</lpage><pub-id pub-id-type="doi">10.1609/aaai.v37i13.26862</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>R</given-names> </name><name name-style="western"><surname>Greenstein</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Fackler</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Bergmann</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bembea</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Winslow</surname><given-names>RL</given-names> </name></person-group><article-title>Offline reinforcement learning with uncertainty for treatment strategies in sepsis</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 9, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2107.04491</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aaron</surname><given-names>S</given-names> </name><name name-style="western"><surname>McEvoy</surname><given-names>DS</given-names> </name><name name-style="western"><surname>Ray</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hickman</surname><given-names>TTT</given-names> </name><name name-style="western"><surname>Wright</surname><given-names>A</given-names> </name></person-group><article-title>Cranky comments: detecting clinical decision support malfunctions through free-text override reasons</article-title><source>J Am Med Inform Assoc</source><year>2019</year><month>01</month><day>1</day><volume>26</volume><issue>1</issue><fpage>37</fpage><lpage>43</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocy139</pub-id><pub-id pub-id-type="medline">30590557</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wright</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ai</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ash</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Clinical decision support alert malfunctions: analysis and empirically derived taxonomy</article-title><source>J Am Med Inform Assoc</source><year>2018</year><month>05</month><day>1</day><volume>25</volume><issue>5</issue><fpage>496</fpage><lpage>506</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocx106</pub-id><pub-id pub-id-type="medline">29045651</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vasey</surname><given-names>B</given-names> </name><name name-style="western"><surname>Nagendran</surname><given-names>M</given-names> </name><name name-style="western"><surname>Campbell</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Reporting guideline for the early stage clinical evaluation of decision support systems driven by artificial intelligence: DECIDE-AI</article-title><source>BMJ</source><year>2022</year><month>05</month><day>18</day><volume>377</volume><fpage>e070904</fpage><pub-id pub-id-type="doi">10.1136/bmj-2022-070904</pub-id><pub-id pub-id-type="medline">35584845</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Additional methodological details and supplementary results.</p><media xlink:href="medinform_v13i1e69145_app1.pdf" xlink:title="PDF File, 1796 KB"/></supplementary-material><supplementary-material id="app2"><label>Checklist 1</label><p>TRIPOD+AI checklist.</p><media xlink:href="medinform_v13i1e69145_app2.pdf" xlink:title="PDF File, 1349 KB"/></supplementary-material></app-group></back></article>