<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v10i3e33212</article-id>
      <article-id pub-id-type="pmid">35275063</article-id>
      <article-id pub-id-type="doi">10.2196/33212</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Improving the Prediction of Persistent High Health Care Utilizers: Retrospective Analysis Using Ensemble Methodology</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Coquet</surname>
            <given-names>Jean</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Nagavally</surname>
            <given-names>Sneha</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Howson</surname>
            <given-names>Stephanie N</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2854-6243</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>McShea</surname>
            <given-names>Michael J</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5110-1758</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Ramachandran</surname>
            <given-names>Raghav</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8845-8950</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Burkom</surname>
            <given-names>Howard S</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0667-9467</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Chang</surname>
            <given-names>Hsien-Yen</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7997-4822</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Weiner</surname>
            <given-names>Jonathan P</given-names>
          </name>
          <degrees>DrPH</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8299-3995</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Kharrazi</surname>
            <given-names>Hadi</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <address>
            <institution>Center for Population Health Information Technology</institution>
            <institution>Johns Hopkins School of Public Health</institution>
            <addr-line>624 N Broadway</addr-line>
            <addr-line>Office 606</addr-line>
            <addr-line>Baltimore, MD, 21205-1900</addr-line>
            <country>United States</country>
            <phone>1 443 287 8264</phone>
            <email>kharrazi@jhu.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1481-4323</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Applied Physics Laboratory</institution>
        <institution>Johns Hopkins University</institution>
        <addr-line>Baltimore, MD</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Center for Population Health Information Technology</institution>
        <institution>Johns Hopkins School of Public Health</institution>
        <addr-line>Baltimore, MD</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Hadi Kharrazi <email>kharrazi@jhu.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>3</month>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>24</day>
        <month>3</month>
        <year>2022</year>
      </pub-date>
      <volume>10</volume>
      <issue>3</issue>
      <elocation-id>e33212</elocation-id>
      <history>
        <date date-type="received">
          <day>27</day>
          <month>8</month>
          <year>2021</year>
        </date>
        <date date-type="rev-request">
          <day>20</day>
          <month>9</month>
          <year>2021</year>
        </date>
        <date date-type="rev-recd">
          <day>21</day>
          <month>2</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>11</day>
          <month>3</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Stephanie N Howson, Michael J McShea, Raghav Ramachandran, Howard S Burkom, Hsien-Yen Chang, Jonathan P Weiner, Hadi Kharrazi. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 24.03.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2022/3/e33212" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>A small proportion of high-need patients persistently use the bulk of health care services and incur disproportionate costs. Population health management (PHM) programs often refer to these patients as persistent high utilizers (PHUs). Accurate PHU prediction enables PHM programs to better align scarce health care resources with high-need PHUs while generally improving outcomes. While prior research in PHU prediction has shown promise, traditional regression methods used in these studies have yielded limited accuracy.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>We are seeking to improve PHU predictions with an ensemble approach in a retrospective observational study design using insurance claim records.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We defined a PHU as a patient with health care costs in the top 20% of all patients for 4 consecutive 6-month periods. We used 2013 claims data to predict PHU status in the next 24 months. Our study population included 165,595 patients in the Johns Hopkins Health Care plan, with 8359 (5.1%) patients identified as PHUs in 2014 and 2015. We assessed the performance of several standalone machine learning methods and then an ensemble approach combining multiple models.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The candidate ensemble with complement naïve Bayes and random forest layers produced increased sensitivity and positive predictive value (PPV; 49.0% and 50.3%, respectively) compared to logistic regression (46.8% and 46.1%, respectively).</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Our results suggest that ensemble machine learning can improve prediction of care management needs. Improved PPV implies reduced incorrect referral of low-risk patients. With the improved sensitivity/PPV balance of this approach, resources may be directed more efficiently to patients needing them most.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>persistent high utilizers</kwd>
        <kwd>ensemble methodology</kwd>
        <kwd>utilization</kwd>
        <kwd>prediction</kwd>
        <kwd>machine learning</kwd>
        <kwd>population health analytics</kwd>
        <kwd>retrospective</kwd>
        <kwd>observational</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Population health management (PHM) programs regularly classify patients by estimated risk of high health care utilization such as hospitalization [<xref ref-type="bibr" rid="ref1">1</xref>]. The classification process enables PHM programs to allocate their limited resources according to the patients’ anticipated needs [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Higher-risk patient groups, if identified correctly, can receive effective interventions such as care management program enrollment to reduce utilization and improve outcomes [<xref ref-type="bibr" rid="ref2">2</xref>]. Additionally, when utilization and costs are successfully contained for high-need patients by proactively preventing undesired outcomes, PHM programs can better allocate the remaining resources to improve the outcomes of other patients [<xref ref-type="bibr" rid="ref3">3</xref>].</p>
      <p>The set of high-risk patients frequently changes over time, with most patients being high-risk for a short term [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. However, some high-risk patients use health care resources persistently for an extended period (eg, more than 24 months) [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref6">6</xref>]. These persistent high utilizer (PHU) patients generally constitute a small segment of the overall patient population but use a considerable proportion of resources in the long term [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref6">6</xref>]. Despite the variety of approaches taken to characterize PHUs, such as adjusting for type of utilization, total costs, number of chronic conditions, and other factors, predicting who becomes a PHU has remained an analytical challenge [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref11">11</xref>].</p>
      <p>Past studies have applied several analytical approaches to identify and predict PHUs in different patient populations. These approaches range from traditional regression methods (eg, logistic regression) [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref8">8</xref>] to complex machine learning techniques (eg, gradient boosting and neural networks) [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. Nonetheless, due to the small number of PHUs in a patient population (often less than 5%), most studies have suffered from either oversensitive models or excessive false predictions of high utilization [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. Thus, the challenge of achieving simultaneously useful levels of sensitivity and positive predictive value (PPV) in PHU prediction models has limited their application in practice [<xref ref-type="bibr" rid="ref12">12</xref>].</p>
      <p>To address the methodological challenges in predicting PHUs, this study tests an ensemble approach to balance the sensitivity and PPV of PHU forecasting at practical levels. The ensemble approach uses a mix of machine learning methodologies to improve both the sensitivity and PPV of PHU predictions at the same time. Using insurance claims data of a large patient population, this study compares the ensemble approach to single models, a baseline model, and a more advanced predictive model.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overall Aims and Definitions</title>
        <p>The overall goal of our study was to assess the value of ensemble methodology for achieving required levels of sensitivity and PPV for PHU prediction. Our analysis aimed to provide a methodology to optimize the tradeoff of highly sensitive and highly specific predictive models of PHUs using an ensemble approach.</p>
        <p>We defined a PHU as an individual who remained in the top 20% of highest health care costs for 4 consecutive 6-month periods (ie, total of 24 months after the base period) [<xref ref-type="bibr" rid="ref4">4</xref>]. Health care costs were defined as the sum of costs covered by the insurer and the patient’s out-of-pocket costs [<xref ref-type="bibr" rid="ref4">4</xref>].</p>
      </sec>
      <sec>
        <title>Data Source and Preparation</title>
        <p>We performed a retrospective analysis of the Johns Hopkins Health Care insurance claims data collected between 2013 and 2015. We applied the Johns Hopkins Adjusted Clinical Groups (ACG) software to the claims data to prepare the data for analysis [<xref ref-type="bibr" rid="ref13">13</xref>]. We categorized the diagnostic codes into higher-level diagnosis groupings called expanded diagnostic clusters (EDCs), and we grouped medication data into Rx-defined morbidity groups (RxMGs) [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. EDCs and RxMGs have been substantially validated in past studies and are routinely used for risk stratification in practice [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref14">14</xref>].</p>
      </sec>
      <sec>
        <title>Study Population</title>
        <p>Johns Hopkins Health Care claims data included 207,421 patients with at least 1 record in 2013 and at least 2 years of continuous enrollment between 2013 and 2015 (<xref rid="figure1" ref-type="fig">Figure 1</xref>). First, 27,518 patients with missing EDC diagnosis codes were excluded, since EDCs were used to predict PHU status within the population. Second, 14,308 patients with EDC codes indicating pregnancy/newborn status were removed, as the anticipated high utilization incurred by these patients is different from that of PHUs. The final study population included 165,595 patients (<xref rid="figure1" ref-type="fig">Figure 1</xref>).</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Selection process of the study population. JHHC: Johns Hopkins Health Care; EDC: expanded diagnostic cluster.</p>
          </caption>
          <graphic xlink:href="medinform_v10i3e33212_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Predictors and Outcome</title>
        <p>Predictors (ie, independent variables) included demographics, EDCs, RxMGs, and other health utilization variables (eg, hospitalization) generated by the ACG system. Many of these predictors, including all EDCs and RxMGs, are categorical variables [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>].</p>
        <p>The outcome of interest, a binary variable, was whether a patient became a PHU after the base year (ie, incurred health care costs in the top 20% of all patients over 4 consecutive 6-month periods).</p>
      </sec>
      <sec>
        <title>Statistical Approach</title>
        <sec>
          <title>Ensemble Methodology</title>
          <p>PHUs constitute a small fraction of the patient population, hence producing a large class imbalance (ie, most patients are non-PHUs). A common issue with single model prediction of highly imbalanced classes is compromising PPV in favor of higher sensitivity. For example, a single predictive model of PHUs may result in many false positives (ie, low PPV) if aiming to capture all PHUs (ie, high sensitivity). However, ensemble models provide a unique opportunity to increase both PPV and sensitivity by combining substantially different predictive models. We hypothesized that an ensemble approach can predict PHUs with both a manageable PPV and an optimal sensitivity compared to basic and advanced single model predictions.</p>
          <p>We assessed several machine learning algorithms to predict PHU status among the study population. We also evaluated the performance of the ACG system, a comprehensive regression-based risk stratification tool commonly used in PHM practice [<xref ref-type="bibr" rid="ref13">13</xref>]. As hypothesized, each of these algorithms yielded average levels of PPV, and we used an ensemble methodology to boost the overall PHU prediction performance.</p>
          <p>Ensemble methods take inputs from multiple models and combine the outputs in various ways to strengthen prediction results [<xref ref-type="bibr" rid="ref15">15</xref>]. In classification problems with imbalanced classes, ensemble methods perform well because multiple models can contribute individual strong features to the overall prediction [<xref ref-type="bibr" rid="ref16">16</xref>]. Since PHUs make up a small fraction of the total population, the occurrence of a PHU in the data can be considered an anomaly [<xref ref-type="bibr" rid="ref4">4</xref>]. Sometimes referred to as anomaly detection, the supervised machine learning problem of classifying PHUs is known as the imbalanced class problem, where the majority class (ie, non-PHUs) is much more prevalent than the minority class (ie, PHUs).</p>
          <p>We chose the stacking ensemble model rather than the voting ensemble approach. The stacking ensemble model uses a metaclassifier to aggregate the results, but the voting ensemble model needs user-specified weights to combine the classifiers, hence adding an impractical step [<xref ref-type="bibr" rid="ref15">15</xref>]. Thus, for this problem space and our data set, we chose the stacking ensemble. Stacking ensemble methods often use multiple model layers and a final prediction model layer. Each layer makes predictions on the input space given. We also used an additional parameter, feature propagation. This technique allows the passing of both features and predictions through each layer of the ensemble [<xref ref-type="bibr" rid="ref15">15</xref>]. <xref rid="figure2" ref-type="fig">Figure 2</xref> depicts the overall structure of our ensemble methodology and schematically shows how multiple layers can improve PPV and sensitivity simultaneously.</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Stacking ensemble architecture. F&#38;P: feature selection and predictions; PHU: persistent high utilizer; non-PHU: nonpersistent high utilizer.</p>
            </caption>
            <graphic xlink:href="medinform_v10i3e33212_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Ensemble Component Model Selection</title>
          <p>The models selected as the layers in the ensemble method were chosen using common techniques, namely assessment of common classification algorithms and random search cross-validation for parameter tuning. Typically, machine learning models are assessed for performance and generalizability. Generalizability is difficult to quantify without large unseen data sets available for testing, but a common technique to test for overfitting is k-fold cross-validation. This technique tests the machine learning model against many different subsets of data and then calculates an average of all tests. For classifying PHUs, generalizability is fundamentally important because future populations tested through these algorithms will have a large variety of differences, including demographic profiles and medical conditions. Accordingly, we employed several techniques to tune the performance and generalizability of individual models before constructing the layers of the stacking ensemble [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref17">17</xref>].</p>
          <p>First, we incorporated an algorithm known as complement naïve Bayes (CNB), which often produces highly sensitive predictions when classes are imbalanced [<xref ref-type="bibr" rid="ref18">18</xref>]. The CNB model is derived from standard multinomial naïve Bayes [<xref ref-type="bibr" rid="ref18">18</xref>]. This model has 3 main parameters, alpha, fit prior, and norm [<xref ref-type="bibr" rid="ref18">18</xref>]. Alpha is a Laplace smoothing parameter that adjusts the shape and fit of the multinomial distribution. This parameter shifts and forms the training distribution to characterize the multidimensional space of the data. Fit prior refines class identification when only a single class is found in the training set, which can easily occur since PHUs occur infrequently in the data set. Fitting the priors of the classifier ensures that the majority class (ie, non-PHUs) still has some probability of not occurring, even though no other class is present in the training data. The norm parameter determines whether the training involves a second normalization of weights, an additional measure to bolster the performance on imbalanced class problems like PHU detection. Naïve Bayes models are very easy to train, so a fine-tuned parameter search was performed to find more than 1 robust CNB for use in the stacking ensemble [<xref ref-type="bibr" rid="ref18">18</xref>].</p>
          <p>Second, we integrated a random forest (RF) classifier in the ensemble model. An RF model is a meta-estimator that fits numerous decision tree classifiers on subsets of data features and averages results (ie, polls) to improve performance [<xref ref-type="bibr" rid="ref19">19</xref>]. Decision trees, and by association RFs, are useful in several applications due to their explainability and ease of training. Decision trees do not require normalization and can accept categorical and numerical variables; however, a shortcoming of decision trees is their difficulty with generalization. Imprecise selection of hyperparameters will make the RF tree overly complex resulting in poor performance when facing unseen patterns [<xref ref-type="bibr" rid="ref19">19</xref>]. Since RFs are an estimator built by decision trees, many of the parameters are carried over, although additional parameters are available for the sampling and final averaging with the RF [<xref ref-type="bibr" rid="ref19">19</xref>].</p>
          <p>All applicable parameters of an RF were varied through a random cross-validated grid search, but a few most notably contributed to overall performance and generalizability. These parameters include number of estimators, maximum depth, minimum samples to split, minimum samples at leaf, maximum number of features, and class weight. The number of estimators is the count of how many decision trees should be fitted to make up the RF [<xref ref-type="bibr" rid="ref19">19</xref>]. Increasing the number of estimators typically increases generalizability but must be monitored for computational complexity. Maximum depth fixes the maximum number of levels that each tree can have, which is critical in generalizability [<xref ref-type="bibr" rid="ref19">19</xref>]. If not set, the tree is continued until each leaf is pure, meaning the tree could learn the pattern of a single person in this population, which is not extensible to unseen populations. Minimum samples to split sets the minimum number of samples at the time of a split, ensuring that each leaf has at least n–1 samples. Minimum samples at leaf is very similar to minimum samples to split but controls samples at the leaf level. In this study, minimum samples at leaf was used to ensure edge cases (ie, unique PHU patterns) were still appropriately populated with training samples. Maximum number of features describes the method used to generate each tree which in certain use cases, taking the square root or log of the total number of features, can increase an RF’s performance [<xref ref-type="bibr" rid="ref19">19</xref>].</p>
          <p>Class weight is the most important RF parameter for performance, although setting it can negatively impact generalizability [<xref ref-type="bibr" rid="ref19">19</xref>]. This parameter adjusts the prior weight on the positive class, which is important for imbalanced classes, and it pushes the decision tree fits to focus more closely on the minority class, making it more robust to edge cases. Since this model was designed to detect PHUs, favoring minority instead of majority class performance was key. Using specific class weights forced the decision trees to allow for a degradation in classifying non-PHUs in favor of an increase in PHU classification. Two RF models were selected from a random search cross-validation of parameters for use in the stacking ensemble. The final stacking ensemble model integrated the CNB and RF models into one predictive model.</p>
          <p>The final ensemble model used an 80/20 split for training and testing of the data. We performed a 5-fold cross-validation on hyperparameter search and recursive feature elimination.</p>
        </sec>
        <sec>
          <title>Performance Metrics</title>
          <p>Typically, positive and negative class performance are assessed equally using a metric such as F1 score. In this study, as the PHU versus non-PHU classes are unequal and the positive class would constitute an infrequent occurrence, only the positive class metrics were considered key for performance improvement. Therefore, we measured PPV and sensitivity metrics to assess performance of all models (ie, individual models and ensemble model). Both performance metrics describe the classification results for the positive class (ie, PHUs). PPV is the proportion of positive classifications that are truly PHUs. Sensitivity is the proportion of PHUs who were classified as positive.</p>
          <p>An important consideration in any machine learning algorithm evaluation is the balance among metrics. A simple way to find an appropriate balance is to change the threshold for classification. Choosing the appropriate threshold can be difficult for health care scenarios due to the risk of incorrect classification for an individual who needs treatment (ie, false negatives). Conversely, classifying too many healthy individuals at risk could overwhelm the resources available for interventions (ie, false positives). To address this issue, we calculated and then plotted sensitivity and PPV for 50 trials at thresholds spaced evenly 0.05 apart. We then calculated the discrimination threshold for the ensemble model to choose the optimal threshold of the PPV versus sensitivity metrics.</p>
          <p>Finally, we compared the PPV and sensitivity of select individual models, which achieved at least 40% performance in both metrics, with the ensemble methodology. The individual models included a logistic regression, the Johns Hopkins ACG model (out-of-box and with no further training) [<xref ref-type="bibr" rid="ref13">13</xref>], and a standalone RF model. The ensemble model included a stacking ensemble with multiple layers combining CNB and RF models.</p>
          <p>All analyses, including descriptive analysis, individual modeling, and ensemble approach, were performed in R (version 3.5.1, R Foundation for Statistical Computing). We used Python pandas and scikit-learn for all modeling pipeline efforts (eg, data cleaning, filtering, hyperparameter search, feature selection, and RF model). We used Python ML Ensemble for the ensemble model [<xref ref-type="bibr" rid="ref20">20</xref>]. We used Python Yellowbrick library to visualize the classification threshold of sensitivity versus positive predictive values. We used the Johns Hopkins ACG system to produce the ACG output and measure the ACG model’s performance [<xref ref-type="bibr" rid="ref13">13</xref>].</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Descriptive Analyses</title>
        <p>The study population comprised 165,595 unique patients including 8359 (5.1%) PHUs (<xref ref-type="table" rid="table1">Table 1</xref>). The PHU population’s average age was more than twice that of the non-PHU population (38.51 years vs 18.79 years). PHUs included fewer males (2735/8359, 32.7%) than non-PHUs (69,683/155,862, 44.7%). As expected, PHUs had more utilization than non-PHUs (1567/8359, 18.7% vs 3891/155,862, 2.5% for inpatient visits and 8332/8359, 99.7% vs 152,199/155,862, 97.3% for outpatient visits, respectively).</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Specification of the study populations (n=165,595).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="220"/>
            <col width="0"/>
            <col width="280"/>
            <col width="0"/>
            <col width="250"/>
            <col width="0"/>
            <col width="220"/>
            <thead>
              <tr valign="bottom">
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="2">Overall study population (n=165,595)</td>
                <td colspan="2">Non-PHU<sup>a</sup> population (n=155,862)</td>
                <td>PHU population (n=8359)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="3">
                  <bold>Age (years), mean (SD)</bold>
                </td>
                <td colspan="2">19.85 (17.45)</td>
                <td colspan="2">18.79 (16.82)</td>
                <td>38.51 (18.01)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>0-17, n (%)</td>
                <td colspan="2">101,264 (61.2)</td>
                <td colspan="2">99,352 (63.7)</td>
                <td colspan="2">1459 (17.5)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>18-64, n (%)</td>
                <td colspan="2">63,260 (38.2)</td>
                <td colspan="2">55,666 (35.7)</td>
                <td colspan="2">6730 (80.5)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>65+, n (%)</td>
                <td colspan="2">1037 (0.6)</td>
                <td colspan="2">844 (0.5)</td>
                <td colspan="2">170 (2.0)</td>
              </tr>
              <tr valign="top">
                <td colspan="3">Sex (male), n (%)</td>
                <td colspan="2">72,974 (44.1)</td>
                <td colspan="2">69,683 (44.7)</td>
                <td>2735 (32.7)</td>
              </tr>
              <tr valign="top">
                <td colspan="8">
                  <bold>Race, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>White</td>
                <td colspan="2">41,492 (25.1)</td>
                <td colspan="2">38,762 (24.9)</td>
                <td colspan="2">2457 (29.4)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Black</td>
                <td colspan="2">54,207 (32.7)</td>
                <td colspan="2">50,993 (32.7)</td>
                <td colspan="2">2879 (34.4)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Other<sup>b</sup></td>
                <td colspan="2">149 (0.1)</td>
                <td colspan="2">143 (0.1)</td>
                <td colspan="2">6 (&#60;0.1)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Missing<sup>c</sup></td>
                <td colspan="2">69,747 (42.1)</td>
                <td colspan="2">65,964 (42.3)</td>
                <td colspan="2">3017 (36.1)</td>
              </tr>
              <tr valign="top">
                <td colspan="8">
                  <bold>Inpatient visits, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>0</td>
                <td colspan="2">160,035 (96.6)</td>
                <td colspan="2">151,971 (97.5)</td>
                <td colspan="2">6792 (81.3)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>1-5</td>
                <td colspan="2">5430 (3.3)</td>
                <td colspan="2">3866 (2.5)</td>
                <td colspan="2">1500 (17.9)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>6-10</td>
                <td colspan="2">77 (&#60;0.1)</td>
                <td colspan="2">20 (&#60;0.1)</td>
                <td colspan="2">54 (0.6)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>11+</td>
                <td colspan="2">19 (&#60;0.1)</td>
                <td colspan="2">5 (&#60;0.1)</td>
                <td colspan="2">13 (0.2)</td>
              </tr>
              <tr valign="top">
                <td colspan="8">
                  <bold>Outpatient visits, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>0</td>
                <td colspan="2">3720 (2.2)</td>
                <td colspan="2">3663 (2.4)</td>
                <td colspan="2">27 (0.3)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>1-5</td>
                <td colspan="2">96,122 (58.0)</td>
                <td colspan="2">94,138 (60.4)</td>
                <td colspan="2">1234 (14.8)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>6-10</td>
                <td colspan="2">33,996 (20.5)</td>
                <td colspan="2">32,317 (20.7)</td>
                <td colspan="2">1428 (17.1)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>11+</td>
                <td colspan="2">31,723 (19.2)</td>
                <td colspan="2">25,744 (16.5)</td>
                <td colspan="2">5670 (67.8)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>PHU: persistent high utilizer.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>Members of known race/ethnicity not equal to Asian, Hispanic, White, or Black.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>Members with empty values for race.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Ensemble Model</title>
        <p>After tuning the ensemble layers, the best-performing ensemble model included 3 input layers and 1 prediction layer. The final ensemble model included 2 input layers of CNB and 1 layer of an RF model. The prediction layer was an RF model. The model included the following variables: race (ie, Black, White, other), age (as of 2013), sex, days of inpatient hospitalization in 2013, emergency department visit count in 2013, psychotherapy services in 2013, outpatient visit count in 2013, all-cause inpatient hospitalization count in 2013, frailty flag for older adults, 87 most frequent Johns Hopkins ACG diagnostic comorbidities (ie, EDCs [<xref ref-type="bibr" rid="ref13">13</xref>]), all Johns Hopkins ACG medication grouping (ie, RxMGs [<xref ref-type="bibr" rid="ref13">13</xref>]), and ACG-derived care coordination risk scores [<xref ref-type="bibr" rid="ref13">13</xref>] (ie, likely coordination issue, possible coordination issue, unlikely coordination issue). These variables are generated by and included in the Johns Hopkins ACG risk stratification models, which are widely used for PHM efforts [<xref ref-type="bibr" rid="ref13">13</xref>]. The stacking ensemble had full feature propagation throughout the layers to allow each model access to all data attributes while gaining classification scores from previous layers. The most performant models were selected for use in the stacking ensemble.</p>
      </sec>
      <sec>
        <title>Model Performance Evaluation</title>
        <p><xref rid="figure3" ref-type="fig">Figure 3</xref> depicts the discrimination threshold plot for a sample decision tree of the ensemble model. The plot conveys the importance of the threshold choice and depicts the tradeoff between PPV and sensitivity. As shown in the figure, patients A and B, both of whom are PHUs, will be identified differently by the model depending on the chosen threshold between PPV and sensitivity. By testing the trained model on these 2 patients, a risk score is generated for each. These risk scores can be compared to any classification threshold. Depending on which side of the threshold the risk scores lie, the model classifies whether patient A, B, or both are PHUs or non-PHUs.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Classification threshold of sensitivity versus positive predictive value (PPV): patient A: incorrectly classified as normal (risk score=82%) and patient B: correctly classified as a persistent high utilizer (risk score=97%).</p>
          </caption>
          <graphic xlink:href="medinform_v10i3e33212_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>The central line in <xref rid="figure3" ref-type="fig">Figure 3</xref> represents the median value for each metric, and the bands represent the variability from the 10th to 90th percentiles. Two important observations about the threshold plot are (1) the typical classification threshold of .50 is not ideal probably due to the imbalanced classes and (2) equally weighting sensitivity and PPV at a threshold of .85 may not be appropriate to classify enough PHUs correctly. Patients A and B in <xref rid="figure3" ref-type="fig">Figure 3</xref> have different classification outcomes and therefore interventions due in part to an arbitrary threshold.</p>
        <p>To replicate the same level of optimality across all models, we used the 95th percentile threshold limit for each model. The absolute cutoff points were slightly different across models with ensemble having an absolute cutoff threshold of .258, RF .224, logistic regression .230, and the ACG model a cutoff of .226. Negative predictive value (NPV) and specificity were also assessed, but performance in these metrics was high (ie, averaging 97% and 99% for NPV and specificity, respectively) and did not vary significantly between models due to the large size and variability of the negative class (ie, non-PHUs).</p>
      </sec>
      <sec>
        <title>Performance Comparison</title>
        <p>The stacking ensemble method achieved a sensitivity of 49.0% and PPV of 50.3%. The ensemble model resulted in a 5%+ increase in both PPV and sensitivity for predicting PHUs over other individual methods such as logistic regression, RF model, and the ACG model (<xref ref-type="table" rid="table2">Table 2</xref>). As shown in <xref ref-type="table" rid="table2">Table 2</xref>, the individual RF was the highest performing nonensemble technique. <xref ref-type="table" rid="table2">Table 2</xref> also includes the optimal parameters used in the stacking ensemble (eg, CNB and RF parameters such as alpha, maximum depth, and minimum sample splits). The final ensemble model also produced an NPV of 97.4%, specificity of 97.3%, and F1 of 49.1% for PHUs and 97.4% for non-PHUs (not shown in <xref ref-type="table" rid="table2">Table 2</xref>). The area under the curve of the ensemble model reached .921; however, comparison of areas under the curve between models was considered not valuable due to the large imbalance of PHUs versus non-PHUs, hence limiting the performance measure comparison to PPV and sensitivity of the models.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Model fit statistics for predicting persistent high utilizer status.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="230"/>
            <col width="440"/>
            <col width="170"/>
            <col width="160"/>
            <thead>
              <tr valign="bottom">
                <td>Model</td>
                <td>Parameter tuning</td>
                <td>Sensitivity, %</td>
                <td>PPV<sup>a</sup>, %</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Stacking ensemble<break/>Layer 1: CNB<sup>b</sup><break/>Layer 2: CNB<break/>Layer 3: RF<sup>c</sup><break/>Prediction layer: RF<break/>Feature propagation</td>
                <td>CNB1 𝛼=.70, fit prior, norm<break/>CNB2 𝛼=.15, fit prior<break/>RF1<break/><list list-type="bullet"><list-item><p>200 estimators</p></list-item><list-item><p>400 max<sup>d</sup> depth</p></list-item><list-item><p>5 min<sup>e</sup> samples split</p></list-item><list-item><p>0.01% min samples leaf</p></list-item><list-item><p>auto max features</p></list-item><list-item><p>class weight=0.842</p></list-item></list>RF2<break/><list list-type="bullet"><list-item><p>100 estimators</p></list-item><list-item><p>350 max depth</p></list-item><list-item><p>2 min samples split</p></list-item><list-item><p>0.01% min samples leaf</p></list-item><list-item><p>class weight=1.0</p></list-item></list></td>
                <td>49.0</td>
                <td>50.3</td>
              </tr>
              <tr valign="top">
                <td>RF</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>300 estimators</p>
                    </list-item>
                    <list-item>
                      <p>500 max depth</p>
                    </list-item>
                    <list-item>
                      <p>20 min samples split</p>
                    </list-item>
                    <list-item>
                      <p>0.01% min samples leaf</p>
                    </list-item>
                  </list>
                </td>
                <td>48.4</td>
                <td>47.2</td>
              </tr>
              <tr valign="top">
                <td>JHU-ACG<sup>f</sup></td>
                <td>ACG<sup>g</sup> system probability of PHU<sup>h</sup></td>
                <td>44.7</td>
                <td>44.1</td>
              </tr>
              <tr valign="top">
                <td>Logistic regression</td>
                <td>Based on 241 parameters (ie, diagnoses and medications)</td>
                <td>46.8</td>
                <td>46.1</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>PPV: positive predictive value.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>CNB: complement naïve Bayes.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>RF: random forest.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>max: maximum.</p>
            </fn>
            <fn id="table2fn5">
              <p><sup>e</sup>min: minimum.</p>
            </fn>
            <fn id="table2fn6">
              <p><sup>f</sup>JHU-ACG: ACG predictive model with no local tuning.</p>
            </fn>
            <fn id="table2fn7">
              <p><sup>g</sup>ACG: adjusted clinical group.</p>
            </fn>
            <fn id="table2fn8">
              <p><sup>h</sup>PHU: persistent high utilizer.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Persistent high utilizers (PHUs) are defined as patients who consistently stay in the highest deciles of health care costs or utilization across multiple years [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. Risk stratification efforts strive to better identify and manage PHUs so that scarce health care resources can be better allocated. Nonetheless, predicting who becomes a PHU is often challenging, partly because PHUs are uncommon [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. Past studies have attempted to improve the prediction of PHUs in various populations; however, those predictions have either suffered from high false negative/positive rates or have been limited in scope [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. In this study, to address the methodological complexity in predicting PHUs, we evaluated the benefit of an ensemble approach to balance the sensitivity and specificity of predicting PHUs.</p>
        <p>Our results show that ensemble methodology can be effectively used to improve both sensitivity and PPV of predicting PHUs. The ensemble model developed in this study included 2 layers of CNB and 1 prediction layer of RF, which can be converged rather quickly. We achieved a sensitivity and PPV of 49.0% and 50.3%, respectively, using the ensemble model. In comparison to the best alternative performing model, which was the standalone RF, the ensemble model improved the sensitivity by 0.6 and PPV by 3.1 absolute percentage points, which represents a 1.2% and 6.6% relative improvement in sensitivity and PPV, respectively. Moreover, standalone RF models are prone to overfitting and often lack generalizability to other populations. The ensemble model was also superior compared to traditional logistic regression and the more established (ACG) models [<xref ref-type="bibr" rid="ref13">13</xref>]. The ensemble model improved the sensitivity and PPV of predicting PHUs by 2.2 and 4.2 absolute percentage points (ie, 4.7% and 9.1% relative improvement) compared to the traditional logistic regression and by 4.3 and 6.2 absolute percentage points (ie, 9.6% and 14.1% relative improvement) when compared to the ACG model [<xref ref-type="bibr" rid="ref13">13</xref>].</p>
        <p>Several studies have examined the use of traditional methods in predicting PHUs; however, models developed in these studies have often generated low PPV rates or showed limited generalizability. For example, in a study of an employer-based health plan, using commercial claims data, a logistic regression model achieved a sensitivity of 80% but PPV of 19% to predict PHUs among the health plan enrollees [<xref ref-type="bibr" rid="ref6">6</xref>]. In another study aiming to predict PHUs, using diagnostic and medication information extracted from claims data, a regression model achieved a sensitivity of 46.7% and PPV of 57.2%; however, the study population was limited to patients aged 18 to 62 years, hence limiting generalizability to other populations [<xref ref-type="bibr" rid="ref4">4</xref>]. Several studies have used regression models to control for underlying demographic and clinical variables and measure the residual differences such as cost, behavioral health, and social determinants of health variables between PHU and non-PHU populations [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. These studies, however, have not published the performance of these regression models in predicting PHUs.</p>
        <p>A few studies have assessed the value of machine learning methods in predicting PHUs. In a study of a statewide Medicaid population, demographics, diagnostics, and medication information were used to predict costs associated with PHUs. The study compared multiple models including linear regression, regularized regression, gradient boosting machine, and recurrent neural networks, but the study did not generate comparable predictive measures as these models did not predict PHU status [<xref ref-type="bibr" rid="ref9">9</xref>]. Another study applied penalized regression, support vector machine, and extreme gradient boosting against claims data to predict PHUs among patients from an academic medical center. The study achieved high sensitivity rates ranging from 72.7% to 78.7%; however, the (recalculated) PPV ranged from 18.6% to 19.8% [<xref ref-type="bibr" rid="ref10">10</xref>]. Among the machine learning studies targeting PHUs, only one study compared an ensemble methodology (using RFs) to other methods (eg, linear regression, decision tree regression) [<xref ref-type="bibr" rid="ref11">11</xref>]. This study, however, predicted cost of PHUs and was limited to patients with schizophrenia, hence limiting its generalizability to the broader population of patients.</p>
        <p>Despite the promising findings of past studies in predicting PHUs, their results cannot be accurately compared to our ensemble model as each study used a slightly different definition of PHU. Some studies have defined PHUs as patients in the top 5% of cost over 2 years [<xref ref-type="bibr" rid="ref4">4</xref>], while other studies have set the bar at 10% or 20% of cost over longer periods of time [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. Future research should attempt to harmonize the definition of PHUs to make the comparison of PHU populations across different populations and health plans feasible. Additionally, harmonization of the PHU definition can facilitate the performance measurement and comparison of PHU predictive models across different health care settings.</p>
        <p>Balancing the sensitivity and PPV of PHU predictions is key in operationalizing such models in PHM efforts. Indeed, given the infrequency of PHUs in the total population of patients, a balanced sensitivity and PPV ratio will play an important role in the management of limited resources for PHUs. In our study, the improvement of model performance compared to the traditional models corresponds to approximately 84 additional PHUs being classified correctly in the test set of 1672 true PHUs. These 84 patients would not have been reviewed for potential proactive interventions by a care manager if tested by a traditional method.</p>
        <p>In this study, we chose to report classification performance at the balanced precision and recall scores (50/50) to highlight optimal performance in both metrics simultaneously. In specific PHM use cases, it may be desirable to select a lower classification threshold and more patients for care or intervention consideration, even if their individual risk score is lower. In large-scale PHM use cases, cost of considering many patients may be too high and a higher classification threshold is to be selected to only manage the most at-risk patients. Hence, individual population health programs may choose different balances of precision versus recall for models predicting PHUs.</p>
        <p>Our study showed that machine learning has a performance advantage over traditional statistical models. Ultimately, improved performance will come from more advanced ensemble methods coupled with continually improving robustness of feature analysis, which together are the keys to significantly increased performance. Model performance could benefit from subpopulation training by reducing the large and variable parameter space for classification. Thus, developing custom groupings of clinical features associated with PHU patients (versus non-PHUs) can potentially advance predictive models of PHUs. For example, clinical groupings identified by unsupervised machine learning techniques (such as latent class analysis) have shown value in improving predictive models of PHUs [<xref ref-type="bibr" rid="ref21">21</xref>].</p>
        <p>Value-based health care providers are increasingly using risk stratification tools to manage their patient populations [<xref ref-type="bibr" rid="ref22">22</xref>]. Providers often use local electronic health records (EHRs) instead of insurance claims to risk stratify patients and predict PHUs [<xref ref-type="bibr" rid="ref23">23</xref>-<xref ref-type="bibr" rid="ref25">25</xref>]. Although advances have been made in using unique EHR data to improve risk prediction using prescription data [<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref28">28</xref>], vital signs [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>], laboratory results [<xref ref-type="bibr" rid="ref31">31</xref>], and free-text analysis [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>], quality of EHR data remains a major challenge in this process [<xref ref-type="bibr" rid="ref34">34</xref>]. Using machine learning models, such as the ensemble models, can potentially help providers address some of these deficiencies and improve the prediction of PHUs using EHR data [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. Future studies should investigate the usability of machine learning models in enhancing EHR-based PHU predictions and their implications for improving the wider population-level health outcomes [<xref ref-type="bibr" rid="ref37">37</xref>].</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Our study has several limitations. First, the results of our ensemble approach and the improvement of the PHU prediction may not generalize to other populations (eg, older adults), different settings (eg, inpatient only), or alternative data sources (eg, EHRs). Future research should explore the use of ensemble methodology in new populations and settings using alternate data sources. Second, the current definition of PHU may not be consistent with the operational definition in all PHM. We used a specific definition for PHU (ie, percentile of cost and time period), but that definition may not fit all populations. The risk stratification research community should harmonize the definition of PHU so predictive models of PHUs can be compared accurately to increase their generalizability. Third, we only used demographics, diagnosis, and medications in our prediction models. Past research has shown the value of social determinants of health in improving the prediction of health care utilization [<xref ref-type="bibr" rid="ref38">38</xref>-<xref ref-type="bibr" rid="ref42">42</xref>]. Future research should investigate the value of the ensemble model in improving predictive models of PHU that incorporate social data. Finally, the ensemble methodology uses an approach that complicates the explanation of a prediction, and thus the operational use of such models in clinical and PHM settings should be further studied.</p>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>A small segment of the patient population uses most of the health care services over extended periods. We used an ensemble model, a machine learning approach that combines multiple modeling techniques, to simultaneously improve the sensitivity and PPV of predicting PHUs using claims data. Future studies should investigate the value of machine learning techniques in predicting PHUs in other health care settings with potentially different underlying populations and different data sources (eg, EHR data).</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ACG</term>
          <def>
            <p>Adjusted Clinical Group</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CNB</term>
          <def>
            <p>complement naïve Bayes</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">EDC</term>
          <def>
            <p>expanded diagnostic cluster</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">NPV</term>
          <def>
            <p>negative predictive value</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">PHM</term>
          <def>
            <p>population health management</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">PHU</term>
          <def>
            <p>persistent high utilizer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">PPV</term>
          <def>
            <p>positive predictive value</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">RF</term>
          <def>
            <p>random forest</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">RxMG</term>
          <def>
            <p>Rx-defined morbidity group</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>We acknowledge the contributions of Sheri Maxim, Jonathan Thornhill, Jason Lee, Hong Kan, and Tom Richards to this project. This project was funded by the Johns Hopkins Applied Physics Laboratory’s National Health Mission Area Independent Research and Development program.</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>HK and MJM codirected the research project. SNH analyzed the data. HYC provided analytical insight and calculated claims costs. HK, MJM, HSB, RR, and JPW reviewed and interpreted the results. HK, SNH, and MJM drafted the manuscript. All authors reviewed and contributed to the final manuscript. HK prepared the manuscript for submission.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Iezzoni</surname>
              <given-names>LI</given-names>
            </name>
          </person-group>
          <source>Risk Adjustment for Measuring Health Care Outcomes, Fourth Edition</source>
          <year>2012</year>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>Health Administration Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Gamache</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Magnuson</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dixon</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Role of informatics in bridging public and population health</article-title>
          <source>Public Health Informatics and Information Systems</source>
          <year>2020</year>
          <publisher-loc>London</publisher-loc>
          <publisher-name>Springer</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>NS</given-names>
            </name>
            <name name-style="western">
              <surname>Whitman</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Vakharia</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Taksler</surname>
              <given-names>GB</given-names>
            </name>
            <name name-style="western">
              <surname>Rothberg</surname>
              <given-names>MB</given-names>
            </name>
          </person-group>
          <article-title>High-cost patients: hot-spotters don't explain the half of it</article-title>
          <source>J Gen Intern Med</source>
          <year>2017</year>
          <month>01</month>
          <volume>32</volume>
          <issue>1</issue>
          <fpage>28</fpage>
          <lpage>34</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27480529"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11606-016-3790-3</pub-id>
          <pub-id pub-id-type="medline">27480529</pub-id>
          <pub-id pub-id-type="pii">10.1007/s11606-016-3790-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC5215147</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Leff</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Lemke</surname>
              <given-names>KW</given-names>
            </name>
            <name name-style="western">
              <surname>Bodycombe</surname>
              <given-names>DP</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>Identifying consistent high-cost users in a health plan: comparison of alternative prediction models</article-title>
          <source>Med Care</source>
          <year>2016</year>
          <month>09</month>
          <volume>54</volume>
          <issue>9</issue>
          <fpage>852</fpage>
          <lpage>859</lpage>
          <pub-id pub-id-type="doi">10.1097/MLR.0000000000000566</pub-id>
          <pub-id pub-id-type="medline">27326548</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guilcher</surname>
              <given-names>SJT</given-names>
            </name>
            <name name-style="western">
              <surname>Bronskill</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Guan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wodchis</surname>
              <given-names>WP</given-names>
            </name>
          </person-group>
          <article-title>Who are the high-cost users? A method for person-centred attribution of health care spending</article-title>
          <source>PLoS One</source>
          <year>2016</year>
          <volume>11</volume>
          <issue>3</issue>
          <fpage>e0149179</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0149179"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0149179</pub-id>
          <pub-id pub-id-type="medline">26937955</pub-id>
          <pub-id pub-id-type="pii">PONE-D-15-32254</pub-id>
          <pub-id pub-id-type="pmcid">PMC4777563</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hwang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>LaClair</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Camacho</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Paz</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Persistent high utilization in a privately insured population</article-title>
          <source>Am J Manag Care</source>
          <year>2015</year>
          <month>04</month>
          <volume>21</volume>
          <issue>4</issue>
          <fpage>309</fpage>
          <lpage>316</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ajmc.com/pubMed.php?pii=86070"/>
          </comment>
          <pub-id pub-id-type="medline">26014469</pub-id>
          <pub-id pub-id-type="pii">86070</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chee</surname>
              <given-names>CP</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Almenoff</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Zulman</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Wagner</surname>
              <given-names>TH</given-names>
            </name>
          </person-group>
          <article-title>Persistence of high health care costs among VA patients</article-title>
          <source>Health Serv Res</source>
          <year>2018</year>
          <month>10</month>
          <volume>53</volume>
          <issue>5</issue>
          <fpage>3898</fpage>
          <lpage>3916</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/29862504"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/1475-6773.12989</pub-id>
          <pub-id pub-id-type="medline">29862504</pub-id>
          <pub-id pub-id-type="pmcid">PMC6153161</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sterling</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Weisner</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Grant</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Pruzansky</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bui</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Madvig</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Pearl</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Association of behavioral health factors and social determinants of health with high and persistently high healthcare costs</article-title>
          <source>Prev Med Rep</source>
          <year>2018</year>
          <month>09</month>
          <volume>11</volume>
          <fpage>154</fpage>
          <lpage>159</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2211-3355(18)30113-X"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.pmedr.2018.06.017</pub-id>
          <pub-id pub-id-type="medline">30003015</pub-id>
          <pub-id pub-id-type="pii">S2211-3355(18)30113-X</pub-id>
          <pub-id pub-id-type="pmcid">PMC6039851</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Delcher</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Shenkman</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Ranka</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Machine learning approaches for predicting high cost high need patient expenditures in health care</article-title>
          <source>Biomed Eng Online</source>
          <year>2018</year>
          <month>11</month>
          <day>20</day>
          <volume>17</volume>
          <issue>Suppl 1</issue>
          <fpage>131</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://biomedical-engineering-online.biomedcentral.com/articles/10.1186/s12938-018-0568-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12938-018-0568-3</pub-id>
          <pub-id pub-id-type="medline">30458798</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12938-018-0568-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC6245495</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>SHX</given-names>
            </name>
            <name name-style="western">
              <surname>Rahman</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Ang</surname>
              <given-names>IYH</given-names>
            </name>
            <name name-style="western">
              <surname>Sridharan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ramachandran</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>DD</given-names>
            </name>
            <name name-style="western">
              <surname>Khoo</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>CS</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Toh</surname>
              <given-names>SES</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>XQ</given-names>
            </name>
          </person-group>
          <article-title>Characterising and predicting persistent high-cost utilisers in healthcare: a retrospective cohort study in Singapore</article-title>
          <source>BMJ Open</source>
          <year>2020</year>
          <month>01</month>
          <day>06</day>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>e031622</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmjopen.bmj.com/lookup/pmidlookup?view=long&#38;pmid=31911514"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmjopen-2019-031622</pub-id>
          <pub-id pub-id-type="medline">31911514</pub-id>
          <pub-id pub-id-type="pii">bmjopen-2019-031622</pub-id>
          <pub-id pub-id-type="pmcid">PMC6955475</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Iyengar</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kho</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Falconer</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Docherty</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Yuen</surname>
              <given-names>GY</given-names>
            </name>
          </person-group>
          <article-title>Predicting future high-cost schizophrenia patients using high-dimensional administrative data</article-title>
          <source>Front Psychiatry</source>
          <year>2017</year>
          <volume>8</volume>
          <fpage>114</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.3389/fpsyt.2017.00114"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fpsyt.2017.00114</pub-id>
          <pub-id pub-id-type="medline">28713293</pub-id>
          <pub-id pub-id-type="pmcid">PMC5491596</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wodchis</surname>
              <given-names>WP</given-names>
            </name>
            <name name-style="western">
              <surname>Austin</surname>
              <given-names>PC</given-names>
            </name>
            <name name-style="western">
              <surname>Henry</surname>
              <given-names>DA</given-names>
            </name>
          </person-group>
          <article-title>A 3-year study of high-cost users of health care</article-title>
          <source>CMAJ</source>
          <year>2016</year>
          <month>02</month>
          <day>16</day>
          <volume>188</volume>
          <issue>3</issue>
          <fpage>182</fpage>
          <lpage>188</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.cmaj.ca/cgi/pmidlookup?view=long&#38;pmid=26755672"/>
          </comment>
          <pub-id pub-id-type="doi">10.1503/cmaj.150064</pub-id>
          <pub-id pub-id-type="medline">26755672</pub-id>
          <pub-id pub-id-type="pii">cmaj.150064</pub-id>
          <pub-id pub-id-type="pmcid">PMC4754179</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="web">
          <article-title>Johns Hopkins ACGs System, version 12</article-title>
          <source>Johns Hopkins School of Public Health</source>
          <year>2019</year>
          <access-date>2022-02-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.hopkinsacg.org/">https://www.hopkinsacg.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Starfield</surname>
              <given-names>BH</given-names>
            </name>
            <name name-style="western">
              <surname>Steinwachs</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Mumford</surname>
              <given-names>LM</given-names>
            </name>
          </person-group>
          <article-title>Development and application of a population-oriented measure of ambulatory care case-mix</article-title>
          <source>Med Care</source>
          <year>1991</year>
          <month>05</month>
          <volume>29</volume>
          <issue>5</issue>
          <fpage>452</fpage>
          <lpage>472</lpage>
          <pub-id pub-id-type="doi">10.1097/00005650-199105000-00006</pub-id>
          <pub-id pub-id-type="medline">1902278</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>ZH</given-names>
            </name>
          </person-group>
          <source>Ensemble Methods: Foundations and Algorithms, 1st Edition</source>
          <year>2012</year>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>Chapman and Hall/CRC</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Duan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Qiu</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Class-imbalanced deep learning via a class-balanced ensemble</article-title>
          <source>IEEE Trans Neural Netw Learn Syst</source>
          <year>2021</year>
          <month>04</month>
          <day>26</day>
          <fpage>1</fpage>
          <pub-id pub-id-type="doi">10.1109/TNNLS.2021.3071122</pub-id>
          <pub-id pub-id-type="medline">33900923</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Predicting hospital readmission: a joint ensemble-learning model</article-title>
          <source>IEEE J Biomed Health Inform</source>
          <year>2020</year>
          <month>02</month>
          <volume>24</volume>
          <issue>2</issue>
          <fpage>447</fpage>
          <lpage>456</lpage>
          <pub-id pub-id-type="doi">10.1109/JBHI.2019.2938995</pub-id>
          <pub-id pub-id-type="medline">31484143</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rennie</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shih</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Teevan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Karger</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Tackling the poor assumptions of naive Bayes text classifiers</article-title>
          <source>Proc 20th Int Conf Mach Learn</source>
          <year>2003</year>
          <volume>3</volume>
          <fpage>616</fpage>
          <lpage>623</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Breiman</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Random forests</article-title>
          <source>Machine Learning</source>
          <year>2001</year>
          <volume>45</volume>
          <fpage>5</fpage>
          <lpage>32</lpage>
          <pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Flennerhag</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <source>ML-Ensemble: high performance ensemble learning in Python</source>
          <access-date>2022-02-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://ml-ensemble.com/">http://ml-ensemble.com/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ramachandran</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>McShea</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Howson</surname>
              <given-names>SN</given-names>
            </name>
            <name name-style="western">
              <surname>Burkom</surname>
              <given-names>HS</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Assessing the value of unsupervised clustering in predicting persistent high health care utilizers: retrospective analysis of insurance claims data</article-title>
          <source>JMIR Med Inform</source>
          <year>2021</year>
          <month>11</month>
          <day>25</day>
          <volume>9</volume>
          <issue>11</issue>
          <fpage>e31442</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2021/11/e31442/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/31442</pub-id>
          <pub-id pub-id-type="medline">34592712</pub-id>
          <pub-id pub-id-type="pii">v9i11e31442</pub-id>
          <pub-id pub-id-type="pmcid">PMC8663459</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pandya</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Electronic health record-based risk stratification: a potential key ingredient to achieving value-based care</article-title>
          <source>Popul Health Manag</source>
          <year>2021</year>
          <month>06</month>
          <day>14</day>
          <volume>24</volume>
          <issue>6</issue>
          <fpage>654</fpage>
          <lpage>656</lpage>
          <pub-id pub-id-type="doi">10.1089/pop.2021.0131</pub-id>
          <pub-id pub-id-type="medline">34129398</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Richards</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Gallagher</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Knudson</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>Comparing population-based risk-stratification model performance using demographic, diagnosis and medication data extracted from outpatient electronic health records versus administrative claims</article-title>
          <source>Med Care</source>
          <year>2017</year>
          <month>08</month>
          <volume>55</volume>
          <issue>8</issue>
          <fpage>789</fpage>
          <lpage>796</lpage>
          <pub-id pub-id-type="doi">10.1097/MLR.0000000000000754</pub-id>
          <pub-id pub-id-type="medline">28598890</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>A practical comparison between the predictive power of population-based risk stratification models using data from electronic health records versus administrative claims: setting a baseline for future EHR-derived risk stratification models</article-title>
          <source>Med Care</source>
          <year>2018</year>
          <month>02</month>
          <volume>56</volume>
          <issue>2</issue>
          <fpage>202</fpage>
          <lpage>203</lpage>
          <pub-id pub-id-type="doi">10.1097/MLR.0000000000000849</pub-id>
          <pub-id pub-id-type="medline">29200132</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez</surname>
              <given-names>CP</given-names>
            </name>
            <name name-style="western">
              <surname>Lowe</surname>
              <given-names>KB</given-names>
            </name>
            <name name-style="western">
              <surname>Huerta</surname>
              <given-names>TR</given-names>
            </name>
            <name name-style="western">
              <surname>Ford</surname>
              <given-names>EW</given-names>
            </name>
          </person-group>
          <article-title>Forecasting the maturation of electronic health record functions among US hospitals: retrospective analysis and predictive model</article-title>
          <source>J Med Internet Res</source>
          <year>2018</year>
          <month>08</month>
          <day>07</day>
          <volume>20</volume>
          <issue>8</issue>
          <fpage>e10458</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jmir.org/2018/8/e10458/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/10458</pub-id>
          <pub-id pub-id-type="medline">30087090</pub-id>
          <pub-id pub-id-type="pii">v20i8e10458</pub-id>
          <pub-id pub-id-type="pmcid">PMC6104443</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Richards</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Shermock</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Elder Dalpoas</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kan</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Alexander</surname>
              <given-names>GC</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the impact of prescription fill rates on risk stratification model performance</article-title>
          <source>Med Care</source>
          <year>2017</year>
          <month>12</month>
          <volume>55</volume>
          <issue>12</issue>
          <fpage>1052</fpage>
          <lpage>1060</lpage>
          <pub-id pub-id-type="doi">10.1097/MLR.0000000000000825</pub-id>
          <pub-id pub-id-type="medline">29036011</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Richards</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Jung</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Comparing the predictive effects of patient medication adherence indices in electronic health record and claims-based risk stratification models</article-title>
          <source>Popul Health Manag</source>
          <year>2021</year>
          <month>10</month>
          <volume>24</volume>
          <issue>5</issue>
          <fpage>601</fpage>
          <lpage>609</lpage>
          <pub-id pub-id-type="doi">10.1089/pop.2020.0306</pub-id>
          <pub-id pub-id-type="medline">33544044</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kan</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Shermock</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Alexander</surname>
              <given-names>GC</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Integrating e-prescribing and pharmacy claims data for predictive modeling: comparing costs and utilization of health plan members who fill their initial medications with those who do not</article-title>
          <source>J Manag Care Spec Pharm</source>
          <year>2020</year>
          <month>10</month>
          <volume>26</volume>
          <issue>10</issue>
          <fpage>1282</fpage>
          <lpage>1290</lpage>
          <pub-id pub-id-type="doi">10.18553/jmcp.2020.26.10.1282</pub-id>
          <pub-id pub-id-type="medline">32996394</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Heins</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Gudzune</surname>
              <given-names>KA</given-names>
            </name>
          </person-group>
          <article-title>Assessing the impact of body mass index information on the performance of risk adjustment models in predicting health care costs and utilization</article-title>
          <source>Med Care</source>
          <year>2018</year>
          <month>12</month>
          <volume>56</volume>
          <issue>12</issue>
          <fpage>1042</fpage>
          <lpage>1050</lpage>
          <pub-id pub-id-type="doi">10.1097/MLR.0000000000001001</pub-id>
          <pub-id pub-id-type="medline">30339574</pub-id>
          <pub-id pub-id-type="pmcid">PMC6231962</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Gudzune</surname>
              <given-names>KA</given-names>
            </name>
          </person-group>
          <article-title>Assessing the added value of blood pressure information derived from electronic health records in predicting health care cost and utilization</article-title>
          <source>Popul Health Manag</source>
          <year>2021</year>
          <month>11</month>
          <day>29</day>
          <fpage>250</fpage>
          <pub-id pub-id-type="doi">10.1089/pop.2021.0250</pub-id>
          <pub-id pub-id-type="medline">34847729</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lemke</surname>
              <given-names>KW</given-names>
            </name>
            <name name-style="western">
              <surname>Gudzune</surname>
              <given-names>KA</given-names>
            </name>
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>Assessing markers from ambulatory laboratory tests for predicting high-risk patients</article-title>
          <source>Am J Manag Care</source>
          <year>2018</year>
          <month>06</month>
          <day>01</day>
          <volume>24</volume>
          <issue>6</issue>
          <fpage>e190</fpage>
          <lpage>e195</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ajmc.com/pubMed.php?pii=87574"/>
          </comment>
          <pub-id pub-id-type="medline">29939509</pub-id>
          <pub-id pub-id-type="pii">87574</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kan</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Leff</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Davison</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kimura</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Anzaldi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Richards</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lasser</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>Defining and assessing geriatric risk factors and associated health care utilization among older adults using claims and electronic health records</article-title>
          <source>Med Care</source>
          <year>2018</year>
          <month>03</month>
          <volume>56</volume>
          <issue>3</issue>
          <fpage>233</fpage>
          <lpage>239</lpage>
          <pub-id pub-id-type="doi">10.1097/MLR.0000000000000865</pub-id>
          <pub-id pub-id-type="medline">29438193</pub-id>
          <pub-id pub-id-type="pii">00005650-201803000-00007</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Anzaldi</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hernandez</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Davison</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Leff</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kimura</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>The value of unstructured electronic health record data in geriatric syndrome case identification</article-title>
          <source>J Am Geriatr Soc</source>
          <year>2018</year>
          <month>08</month>
          <volume>66</volume>
          <issue>8</issue>
          <fpage>1499</fpage>
          <lpage>1507</lpage>
          <pub-id pub-id-type="doi">10.1111/jgs.15411</pub-id>
          <pub-id pub-id-type="medline">29972595</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Scharfstein</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Prospective EHR-based clinical trials: the challenge of missing data</article-title>
          <source>J Gen Intern Med</source>
          <year>2014</year>
          <month>07</month>
          <volume>29</volume>
          <issue>7</issue>
          <fpage>976</fpage>
          <lpage>978</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24839057"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11606-014-2883-0</pub-id>
          <pub-id pub-id-type="medline">24839057</pub-id>
          <pub-id pub-id-type="pmcid">PMC4061350</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Rahman</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Ang</surname>
              <given-names>IYH</given-names>
            </name>
            <name name-style="western">
              <surname>Sridharan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ramachandran</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>DD</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>CS</given-names>
            </name>
            <name name-style="western">
              <surname>Toh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>XQ</given-names>
            </name>
          </person-group>
          <article-title>Characterization of high healthcare utilizer groups using administrative data from an electronic medical record database</article-title>
          <source>BMC Health Serv Res</source>
          <year>2019</year>
          <month>07</month>
          <day>05</day>
          <volume>19</volume>
          <issue>1</issue>
          <fpage>452</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmchealthservres.biomedcentral.com/articles/10.1186/s12913-019-4239-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12913-019-4239-2</pub-id>
          <pub-id pub-id-type="medline">31277649</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12913-019-4239-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC6612067</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kan</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Bodycombe</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Lemke</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>Exploring the use of machine learning for risk adjustment: a comparison of standard and penalized linear regression models in predicting health care costs in older adults</article-title>
          <source>PLoS One</source>
          <year>2019</year>
          <volume>14</volume>
          <issue>3</issue>
          <fpage>e0213258</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0213258"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0213258</pub-id>
          <pub-id pub-id-type="medline">30840682</pub-id>
          <pub-id pub-id-type="pii">PONE-D-18-17810</pub-id>
          <pub-id pub-id-type="pmcid">PMC6402678</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gamache</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>Public and population health informatics: the bridging of big data to benefit communities</article-title>
          <source>Yearb Med Inform</source>
          <year>2018</year>
          <month>08</month>
          <volume>27</volume>
          <issue>1</issue>
          <fpage>199</fpage>
          <lpage>206</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.thieme-connect.com/DOI/DOI?10.1055/s-0038-1667081"/>
          </comment>
          <pub-id pub-id-type="doi">10.1055/s-0038-1667081</pub-id>
          <pub-id pub-id-type="medline">30157524</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hatef</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Rouhizadeh</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Assessing the impact of social needs and social determinants of health on health care utilization: using patient- and community-level data</article-title>
          <source>Popul Health Manag</source>
          <year>2021</year>
          <month>04</month>
          <volume>24</volume>
          <issue>2</issue>
          <fpage>222</fpage>
          <lpage>230</lpage>
          <pub-id pub-id-type="doi">10.1089/pop.2020.0043</pub-id>
          <pub-id pub-id-type="medline">32598228</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Hatef</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Impact of area deprivation index on the performance of claims-based risk-adjustment models in predicting health care costs and utilization</article-title>
          <source>Popul Health Manag</source>
          <year>2021</year>
          <month>06</month>
          <volume>24</volume>
          <issue>3</issue>
          <fpage>403</fpage>
          <lpage>411</lpage>
          <pub-id pub-id-type="doi">10.1089/pop.2020.0135</pub-id>
          <pub-id pub-id-type="medline">33434448</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hatef</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Nelson</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Sylling</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Lasser</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Searle</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Predmore</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Batten</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Curtis</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Fihn</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>The association between neighborhood socioeconomic and housing characteristics with hospitalization: results of a national study of veterans</article-title>
          <source>J Am Board Fam Med</source>
          <year>2019</year>
          <volume>32</volume>
          <issue>6</issue>
          <fpage>890</fpage>
          <lpage>903</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jabfm.org/cgi/pmidlookup?view=long&#38;pmid=31704758"/>
          </comment>
          <pub-id pub-id-type="doi">10.3122/jabfm.2019.06.190138</pub-id>
          <pub-id pub-id-type="medline">31704758</pub-id>
          <pub-id pub-id-type="pii">32/6/890</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hatef</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Taghipour</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Vyas</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Gottlieb</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Including social and behavioral determinants in predictive models: trends, challenges, and opportunities</article-title>
          <source>JMIR Med Inform</source>
          <year>2020</year>
          <month>09</month>
          <day>08</day>
          <volume>8</volume>
          <issue>9</issue>
          <fpage>e18084</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2020/9/e18084/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/18084</pub-id>
          <pub-id pub-id-type="medline">32897240</pub-id>
          <pub-id pub-id-type="pii">v8i9e18084</pub-id>
          <pub-id pub-id-type="pmcid">PMC7509627</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vest</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Adler-Milstein</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gottlieb</surname>
              <given-names>LM</given-names>
            </name>
            <name name-style="western">
              <surname>Bian</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Campion</surname>
              <given-names>TR</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>GR</given-names>
            </name>
            <name name-style="western">
              <surname>Donnelly</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Harper</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Huerta</surname>
              <given-names>TR</given-names>
            </name>
            <name name-style="western">
              <surname>Kansky</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Khurshid</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kooreman</surname>
              <given-names>HE</given-names>
            </name>
            <name name-style="western">
              <surname>McDonnell</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Overhage</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Pantell</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Parisi</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Shenkman</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Tierney</surname>
              <given-names>WM</given-names>
            </name>
            <name name-style="western">
              <surname>Wiehe</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Harle</surname>
              <given-names>CA</given-names>
            </name>
          </person-group>
          <article-title>Assessment of structured data elements for social risk factors</article-title>
          <source>Am J Manag Care</source>
          <year>2022</year>
          <month>01</month>
          <day>01</day>
          <volume>28</volume>
          <issue>1</issue>
          <fpage>e14</fpage>
          <lpage>e23</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ajmc.com/pubMed.php?pii=88816"/>
          </comment>
          <pub-id pub-id-type="doi">10.37765/ajmc.2022.88816</pub-id>
          <pub-id pub-id-type="medline">35049262</pub-id>
          <pub-id pub-id-type="pii">88816</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
