<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
    <article-id pub-id-type="publisher-id">v6i1e17</article-id>
    <article-id pub-id-type="pmid">29572199</article-id>
    <article-id pub-id-type="doi">10.2196/medinform.8611</article-id>
    <article-categories>
      <subj-group subj-group-type="heading">
        <subject>Original Paper</subject>
      </subj-group>
      <subj-group subj-group-type="article-type">
        <subject>Original Paper</subject>
      </subj-group>
    </article-categories>
    <title-group>
      <article-title>Assessing the Readability of Medical Documents: A Ranking Approach</article-title>
    </title-group>
    <contrib-group>
      <contrib contrib-type="editor">
        <name>
          <surname>Eysenbach</surname>
          <given-names>Gunther</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Leroy</surname>
          <given-names>Gondy</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Adams</surname>
          <given-names>Nikki</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Brigo</surname>
          <given-names>Francesco</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="author" id="contrib1">
        <name name-style="western">
          <surname>Zheng</surname>
          <given-names>Jiaping</given-names>
        </name>
        <degrees>MS</degrees>
        <xref rid="aff1" ref-type="aff">1</xref>
        <ext-link ext-link-type="orcid" xlink:href="http://orcid.org/0000-0001-7662-810X">http://orcid.org/0000-0001-7662-810X</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib2" corresp="yes">
      <name name-style="western">
        <surname>Yu</surname>
        <given-names>Hong</given-names>
      </name>
      <degrees>PhD, FACMI</degrees>
      <xref rid="aff1" ref-type="aff">1</xref>
      <xref rid="aff2" ref-type="aff">2</xref>
      <address>
        <institution>Center for Healthcare Organization and Implementation Research</institution>
        <institution>Bedford Veterans Affairs Medical Center</institution>
        <addr-line>200 Springs Road</addr-line>
        <addr-line>Bedford, MA, 01730</addr-line>
        <country>United States</country>
        <phone>1 781 687 2000</phone>
        <fax>1 781 687 2000</fax>
        <email>hong.yu@umassmed.edu</email>
      </address>  
      <xref rid="aff3" ref-type="aff">3</xref>
      <xref rid="aff4" ref-type="aff">4</xref>
      <ext-link ext-link-type="orcid" xlink:href="http://orcid.org/0000-0001-9263-5035">http://orcid.org/0000-0001-9263-5035</ext-link></contrib>
    </contrib-group>
    <aff id="aff1">
    <sup>1</sup>
    <institution>College of Information and Computer Sciences</institution>
    <institution>University of Massachusetts</institution>  
    <addr-line>Amherst, MA</addr-line>
    <country>United States</country></aff>
    <aff id="aff2">
    <sup>2</sup>
    <institution>Center for Healthcare Organization and Implementation Research</institution>
    <institution>Bedford Veterans Affairs Medical Center</institution>  
    <addr-line>Bedford, MA</addr-line>
    <country>United States</country></aff>
    <aff id="aff3">
    <sup>3</sup>
    <institution>Department of Computer Science</institution>
    <institution>University of Massachusetts</institution>  
    <addr-line>Lowell, MA</addr-line>
    <country>United States</country></aff>
    <aff id="aff4">
    <sup>4</sup>
    <institution>Department of Medicine</institution>
    <institution>University of Massachusetts Medical School</institution>  
    <addr-line>Worcester, MA</addr-line>
    <country>United States</country></aff>
    <author-notes>
      <corresp>Corresponding Author: Hong Yu 
      <email>hong.yu@umassmed.edu</email></corresp>
    </author-notes>
    <pub-date pub-type="collection"><season>Jan-Mar</season><year>2018</year></pub-date>
    <pub-date pub-type="epub">
      <day>23</day>
      <month>03</month>
      <year>2018</year>
    </pub-date>
    <volume>6</volume>
    <issue>1</issue>
    <elocation-id>e17</elocation-id>
    <!--history from ojs - api-xml-->
    <history>
      <date date-type="received">
        <day>3</day>
        <month>8</month>
        <year>2017</year>
      </date>
      <date date-type="rev-request">
        <day>20</day>
        <month>11</month>
        <year>2017</year>
      </date>
      <date date-type="rev-recd">
        <day>12</day>
        <month>1</month>
        <year>2018</year>
      </date>
      <date date-type="accepted">
        <day>31</day>
        <month>1</month>
        <year>2018</year>
      </date>
    </history>
    <!--(c) the authors - correct author names and publication date here if necessary. Date in form ', dd.mm.yyyy' after jmir.org-->
    <copyright-statement>©Jiaping Zheng, Hong Yu. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 23.03.2018.</copyright-statement>
    <copyright-year>2018</copyright-year>
    <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
      <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
    </license>  
    <self-uri xlink:href="http://medinform.jmir.org/2018/1/e17/" xlink:type="simple"/>
    <abstract>
      <sec sec-type="background">
        <title>Background</title>
        <p>The use of electronic health record (EHR) systems with patient engagement capabilities, including viewing, downloading, and transmitting health information, has recently grown tremendously. However, using these resources to engage patients in managing their own health remains challenging due to the complex and technical nature of the EHR narratives.</p>
      </sec>
      <sec sec-type="objective">
        <title>Objective</title>
        <p>Our objective was to develop a machine learning–based system to assess readability levels of complex documents such as EHR notes.</p>
      </sec>
      <sec sec-type="methods">
        <title>Methods</title>
        <p>We collected difficulty ratings of EHR notes and Wikipedia articles using crowdsourcing from 90 readers. We built a supervised model to assess readability based on relative orders of text difficulty using both surface text features and word embeddings. We evaluated system performance using the Kendall coefficient of concordance against human ratings.</p>
      </sec>
      <sec sec-type="results">
        <title>Results</title>
        <p>Our system achieved significantly higher concordance (.734) with human annotators than did a baseline using the Flesch-Kincaid Grade Level, a widely adopted readability formula (.531). The improvement was also consistent across different disease topics. This method’s concordance with an individual human user’s ratings was also higher than the concordance between different human annotators (.658).</p>
      </sec>
      <sec sec-type="conclusions">
        <title>Conclusions</title>
        <p>We explored methods to automatically assess the readability levels of clinical narratives. Our ranking-based system using simple textual features and easy-to-learn word embeddings outperformed a widely used readability formula. Our ranking-based method can predict relative difficulties of medical documents. It is not constrained to a predefined set of readability levels, a common design in many machine learning–based systems. Furthermore, the feature set does not rely on complex processing of the documents. One potential application of our readability ranking is personalization, allowing patients to better accommodate their own background knowledge.</p>
      </sec>
    </abstract>
    <kwd-group>
      <kwd>electronic health records</kwd>
      <kwd>readability</kwd>
      <kwd>comprehension</kwd>
      <kwd>machine learning</kwd>
    </kwd-group></article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Research has demonstrated that actively involving patients in the management of their own health can lead to better outcomes, and potentially lower costs [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Patient engagement [<xref ref-type="bibr" rid="ref3">3</xref>]—a concept that includes patient activation, and interventions designed to increase activation and promote positive patient behavior—has thus emerged as an important component of strategies to improve health care. A growing body of evidence has accumulated on better health outcomes and care experiences associated with higher engagement. For example, patients with chronic diseases who have high patient activation measure scores are more likely to practice self-management behaviors and report high medication adherence [<xref ref-type="bibr" rid="ref4">4</xref>]. High patient activation measure scores are also associated with a high likelihood of clinical indicators (eg, hemoglobin A<sub>1c</sub>, high-density lipoprotein, and triglycerides) being in the normal range [<xref ref-type="bibr" rid="ref1">1</xref>].</p>
        <p>The use of electronic health record (EHR) systems with patient engagement capabilities, including viewing, downloading, and transmitting health information, has recently grown tremendously. According to data from the US Office of the National Coordinator for Health Information Technology, the percentage of hospitals that enable patients to electronically view, download, and transmit their health information grew almost 7-fold between 2013 and 2015 [<xref ref-type="bibr" rid="ref5">5</xref>]. In 2015, 95% of hospitals provided their patients with the ability to view their information.</p>
        <p>However, actively engaging patients in the management of their own health remains challenging, despite the evidence of better health care outcomes and potentially lower costs. Access to EHRs by itself is not sufficient to motivate patients to be involved because of the complex and technical nature of the EHR. Patients without training in medicine may struggle to process and understand the information buried in the technical language in EHRs. In fact, materials beyond patients’ reading abilities are widely reported in the literature [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. The lack of explanation that an expert can provide when reading EHR notes may also engender unnecessary anxiety or confusion [<xref ref-type="bibr" rid="ref11">11</xref>]. Furthermore, many patients have limited health literacy and are not proficient in completing tasks considered essential to successfully navigate the health system and act on health information [<xref ref-type="bibr" rid="ref12">12</xref>].</p>
        <p>Therefore, assessing the difficulty of EHR notes and integrating appropriate educational assistance in EHR systems may make them more accessible for a layperson without professional training in medicine. In this study, we explored methods to automatically assess the readability levels of clinical narratives in EHRs and other complex documents. An accurate assessment of these documents can be used to match patients’ literacy levels, facilitating patient activation and engagement.</p>
      </sec>
      <sec>
        <title>Prior Work</title>
        <p>The research community has relied on readability formulas to assess a variety of information materials for patients. Numerous readability metrics have been developed to assess the grade level or the number of years of education needed for a person to understand the content. One of the most widely used in the health domain is the Flesch-Kincaid Grade Level [<xref ref-type="bibr" rid="ref13">13</xref>] (FKGL), which predicts a grade level based on the average sentence length and the average word length. Other similar metrics are the Simple Measure of Gobbledygook, Gunning Fog Index, Coleman-Liau Index, and New Dale-Chall formula. These metrics rely on the assumption that the longer the words and the sentences are, the more difficult the text is. However, this assumption does not hold for EHR narratives, as sentences are usually short and abbreviations are common.</p>
        <p>There were also efforts in the health care domain to develop instruments for medical documents. One measurement proposed by Kim et al [<xref ref-type="bibr" rid="ref14">14</xref>] compared differences in surface text, syntactic features, and semantic features with a known set of easy and difficult documents and reported normalized scores. Another method for health text was based on a naive Bayes classifier [<xref ref-type="bibr" rid="ref15">15</xref>]. Those authors collected training documents from blogs, patient education documents, and medical journals. They used vocabularies in these documents as features for the classifier. Both of the methods relied on manually curated documents.</p>
      </sec>
      <sec>
        <title>Goal of This Work</title>
        <p>In this work, we considered measuring readability as a ranking task, where the relative difficulty of documents is compared. Readability in the health domain is often measured with formulas developed to ensure that school textbooks are appropriate for children at a particular school grade level [<xref ref-type="bibr" rid="ref16">16</xref>]. However, obtaining a grade level often is not the ultimate goal. The document’s grade level is usually compared with a person’s educational level or another document’s grade level in order to find appropriate reading materials. The number of years of education has been challenged as a proxy measure for one’s educational experiences when measuring cognitive functions. One study has shown that, in a sample of elderly African Americans, nearly 30% read 3 or more years below their self-reported educational level [<xref ref-type="bibr" rid="ref17">17</xref>]. Other studies have also advocated the use of reading or literacy ability instead of years of education to account for variance in neuropsychological assessments [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>].</p>
        <p>Therefore, ranking the readability of documents is well suited to applications whose main concern is to match difficulty levels with existing text or to identify easier or more difficult ones, rather than to obtain an absolute score. For example, a patient-facing EHR system may learn from its users’ reactions to infer their reading ability and present appropriate educational materials. Such a system can be personalized for an individual user. A user with limited literacy will only see straightforward materials, whereas higher-quality materials that require higher literacy levels can be presented to an advanced user. This personalization is a first step toward user-centered care. To this end, we developed a machine learning model to compare the relative difficulty of documents using data collected from Amazon Mechanical Turk (AMT) users. A demonstration website is available [<xref ref-type="bibr" rid="ref20">20</xref>].</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data</title>
        <p>We collected difficulty levels on health-related documents from human annotators. We recruited users on AMT (Amazon.com, Inc, Seattle, WA, USA) to read and rate pairs of documents based on their perceived difficulty. We screened AMT users to ensure that they were from the United States and had an approval rating of at least 95% in prior tasks. Each reader was presented with 20 randomly selected pairs of documents side by side on the computer screen. The readers were requested to rate the readability of the documents on a scale from 1 (easiest to understand) to 10 (most difficult to understand). The setup to show 2 documents helped reduce variation when we assembled the ratings into a complete ranking, as it provided an explicit partial ranking, as opposed to an implicit order inferred from the difficulty ratings.</p>
        <p>The 2 documents in each document pair were of similar length (within a 50-token difference, where a token is a word or term) and comparable difficulty according to FKGL (within 0.5 grade level). We sourced the documents from English Wikipedia articles and deidentified EHR notes written by physicians. The 20 document pairs consisted of 5 pairs of Wikipedia documents, 5 pairs of EHR documents, and 10 pairs of mixed-source documents.</p>
        <p>We selected 3 common diseases as topics from the document sources: cancer, diabetes, and hypertension. Wikipedia documents were randomly selected from all article pages up to 3 levels under the disease category page, following the category structure. EHR notes were selected using <italic>International Classification of Diseases, Ninth Revision</italic> codes (140-195 for cancer, 250.00-250.93 for diabetes, and 401.0-401.9 for hypertension). For each disease topic, we collected data from 30 AMT users. In total, 90 AMT users annotated 900 document pairs, with 927 of the documents being unique. <xref ref-type="table" rid="table1">Table 1</xref> shows the statistics of the documents annotated by these users.</p>
      </sec>
      <sec>
        <title>Machine Learning System</title>
        <sec>
          <title>Learning to Rank</title>
          <p>We developed a supervised learning system for EHR readability. Traditionally, readability is measured at school grade levels. Formulas that are widely used in the health care domain include the FKGL, Simple Measure of Gobbledygook, Gunning Fog Index, Coleman-Liau Index, and New Dale-Chall formula. They all use a limited number of factors, mostly word and sentence lengths, to estimate a document’s grade level. These simple features, however, are not able to fully capture the complexity of medical documents when used alone as in the formulas. For instance, EHR narratives often contain abbreviations and lists, which are treated as short words and sentences, thus lowering the estimated grade level. However, the abbreviations present a great challenge to a layperson’s understanding [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>].</p>
          <p>In the machine learning community, many systems were developed to classify documents into a predefined set of readability levels. Such systems can include a multitude of features, including lexical, syntactic, and discourse features. These methods are nevertheless constrained in the granularity that they can estimate, since the predefined difficulty levels are often limited.</p>
          <p>In our work, we approached readability as a ranking problem, in which the difficulty levels between documents are compared. This approach overcomes the problems in both the traditional formulas and the classification methods: we are not solely reliant on word and sentence lengths as in the formulas, and our approach can order readability levels for a set of documents.</p>
          <p>We trained our ranking system using a pairwise approach. From each user’s documents, we generated a training example from any 2 documents that were assigned different difficulty levels.</p>
          <p>A support vector machine (SVM) model was learned from the pairwise comparisons of AMT users’ assigned document difficulty levels using the SVM<sup>rank</sup> package [<xref ref-type="bibr" rid="ref23">23</xref>]. SVM models normally optimize a hinge loss function based on a binary label for every training example. In the pairwise scenario, the objective is to minimize the number of discordant pairs—that is, pairs that are ordered incorrectly with respect to the true order. More formally, given a set of training examples {(<bold>x</bold><sub>i</sub>, y<sub>i</sub>)}, the primal form of the problem is given by the equation in <xref ref-type="fig" rid="figure1">Figure 1</xref>, where <bold>w</bold> is the weight vector, <italic>C</italic> parameterizes the trade-off between training error and margin size, and ξ denotes the slack variables. Rearranging the first constraint gives <bold>w</bold><sup>T</sup>(<bold>x</bold><sub>i</sub>–<bold>x</bold><sub>j</sub>)&gt;1–ξ<sub>i,j</sub>, which is equivalent to a classic SVM problem on the modified input vectors <bold>x</bold>′=<bold>x</bold><sub>i</sub>–<bold>x</bold><sub>j</sub>. Therefore, a binary classification SVM optimizer can be used to solve the problem.</p>
          <p>In our dataset, we generated pairwise difference vectors <bold>x</bold>′ from each AMT user’s ratings. We did not generate difference vectors across different users, because ratings from different users may not form a consistent ranking in the way that ratings from a single user do. For example, a vector was generated from 2 documents, A and B, when both were rated by the same user, but not from 2 documents rated by different users.</p>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Statistics of documents annotated by readers.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="30"/>
              <col width="270"/>
              <col width="250"/>
              <col width="250"/>
              <col width="200"/>
              <thead>
                <tr valign="bottom">
                  <td colspan="2">Source and disease</td>
                  <td>Documents (n)</td>
                  <td>Sentences (n)</td>
                  <td>Tokens<sup>a</sup> (n)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="5"><bold>Wikipedia</bold></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Cancer</td>
                  <td>215</td>
                  <td>2510</td>
                  <td>46,349</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Diabetes</td>
                  <td>74</td>
                  <td>1352</td>
                  <td>33,402</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Hypertension</td>
                  <td>85</td>
                  <td>2007</td>
                  <td>45,440</td>
                </tr>
                <tr valign="top">
                  <td colspan="5"><bold>EHR<sup>b</sup> notes</bold></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Cancer</td>
                  <td>127</td>
                  <td>2067</td>
                  <td>37,830</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Diabetes</td>
                  <td>195</td>
                  <td>6335</td>
                  <td>81,085</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Hypertension</td>
                  <td>231</td>
                  <td>6594</td>
                  <td>90,784</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Total</td>
                  <td>927</td>
                  <td>20,865</td>
                  <td>334,890</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table1fn1">
                <p><sup>a</sup>A token is, loosely, a word or term.</p>
              </fn>
              <fn id="table1fn2">
                <p><sup>b</sup>EHR: electronic health record.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <fig id="figure1" position="float">
            <label>Figure 1</label>
            <caption>
              <p>The primal form of pairwise ranking.</p>
            </caption>
            <graphic xlink:href="medinform_v6i1e17_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Features</title>
          <p>We employed several types of features, including those from traditional readability formulas. We included average words per sentence, average syllables per word from the FKGL formula, proportion of polysyllabic words (words with more than 3 syllables) from the Gunning Fog Index, and percentage of difficult words from the New Dale-Chall formula. Although these formulas do not correlate well with human perceptions of difficulty [<xref ref-type="bibr" rid="ref24">24</xref>], these word length–based features are useful at capturing some longer medical jargon (eg, Huntington disease). There is also evidence that the perceived difficulty of a word is correlated with its length [<xref ref-type="bibr" rid="ref25">25</xref>]. We also included word frequency obtained from the Wikipedia documents and EHR notes, since common words have been found likely to be perceived as easier to understand [<xref ref-type="bibr" rid="ref25">25</xref>]. We grouped the frequencies into 10 bins and used the percentage of words in each bin as features. Additional features included document length measured in words and sentences. Long documents require more cognitive processing to comprehend, which might translate to higher perceived difficulty. Lastly, we captured language patterns using 2 word embeddings learned separately from Wikipedia documents and deidentified EHR notes. We used Word2vec [<xref ref-type="bibr" rid="ref26">26</xref>] to learn a 200-dimensional skip-gram embedding.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>System Performance</title>
        <p>We split the annotated data three ways, into training (60%), development (20%), and test (20%) sets. The 3 disease topics were stratified in the split. Hyperparameters were optimized on the development set. We obtained final test results from a model trained using the optimized parameters.</p>
        <p>We evaluated our system using the Kendall coefficient of concordance (<italic>W</italic>) [<xref ref-type="bibr" rid="ref27">27</xref>], a statistic that measures the agreement between rankings from multiple raters. The coefficient aggregates the ranks assigned to each item from all raters and measures the variance of these aggregated ranks. The variance is then normalized to be between 0 and 1. Higher values indicate a higher level of concordance. In our experiments, for each AMT user, we ordered his or her documents by their assigned difficulty levels and calculated <italic>W</italic> between this order and the order generated from our system prediction. We then averaged the <italic>W</italic> coefficients of all the users.</p>
        <p><xref ref-type="table" rid="table2">Table 2</xref> shows our system’s performance, in the row “new system.” The next rows show different experiment settings discussed in the next two sections. As a baseline, we evaluated the performance of the widely used FKGL readability formula. The average agreement between this formula and the AMT annotators was .531. Our system achieved an agreement of .734 with the AMT annotators, outperforming the FKGL baseline by 38.3%. The increase is statistically significant as assessed by a Wilcoxon signed rank test at the <italic>P</italic>=.05 level.</p>
        <p>We also trained and tested separate models for each of the disease topics following the same process. Our system showed consistent improvement over the baseline across all disease categories. Agreement in the diabetes and hypertension categories increased significantly over the baseline FKGL metric. The cancer category improved substantially, but not significantly, over the baseline. These results suggested that our method is robust across different topics.</p>
       
      </sec>
      <sec>
        <title>User Behavior</title>
        <p>A variety of factors may influence a reader’s reading comprehension, which in turn determines his or her judgment on a document’s difficulty. We examined the differences in the AMT users’ difficulty ratings using the same Kendall <italic>W</italic> coefficient. We calculated <italic>W</italic> for each pair of users’ ranked documents. The average concordance between any 2 users was .658. <xref ref-type="fig" rid="figure2">Figure 2</xref> shows the distribution of concordance between any 2 users in our dataset.</p>
        <p>While there are pairs of users whose concordance was low, most (851/1299, 65.51%) had a concordance greater than .6. When examined on an individual level, the low concordance can often be attributed to a few users who appeared to disagree with many others. There were 9 users who had a less than .5 concordance with more than 10 other users. Furthermore, 5 of these users’ mean concordance with other users was less than .5.</p>
        
 <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>System performance (Kendall <italic>W</italic>) compared with baseline for specific disease topics and with partial datasets. Numbers in parentheses are percentage improvements over FKGL (Flesch-Kincaid Grade Level). <italic>P</italic> values are comparisons with FKGL using a Wilcoxon signed rank test.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="240"/>
            <col width="100"/>
            <col width="75"/>
            <col width="100"/>
            <col width="75"/>
            <col width="110"/>
            <col width="75"/>
            <col width="100"/>
            <col width="75"/>
            <thead>
              <tr valign="top">
                <td colspan="2">System</td>
                <td colspan="2">Cancer</td>
                <td colspan="2">Diabetes</td>
                <td colspan="2">Hypertension</td>
                <td colspan="2">All</td>
              </tr>
              <tr valign="top">
                <td colspan="2">System</td>
                <td>Kendall <italic>W</italic></td>
                <td><italic>P</italic> value</td>
                <td>Kendall <italic>W</italic></td>
                <td><italic>P</italic> value</td>
                <td>Kendall <italic>W</italic></td>
                <td><italic>P</italic> value</td>
                <td>Kendall <italic>W</italic></td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">FKGL (baseline)</td>
                <td>.541</td>
                <td><break/></td>
                <td>.490</td>
                <td><break/></td>
                <td>.561</td>
                <td><break/></td>
                <td>.531</td>
                <td><break/></td>
              </tr>
              <tr valign="bottom">
                <td colspan="2">New system</td>
                <td>.656 (+21.3)</td>
                <td>.08</td>
                <td>.790 (+61.3)</td>
                <td>.02</td>
                <td>.715 (+27.5)</td>
                <td>.03</td>
                <td>.734 (+38.3)</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td colspan="10"><bold>New system with data subsets excluded</bold></td>
              </tr>
              <tr valign="bottom">
                <td><break/></td>
                <td>Excluding eccentric users</td>
                <td>.694 (+28.3)</td>
                <td>.03</td>
                <td>.762 (+55.5)</td>
                <td>.02</td>
                <td>.727 (+29.6)</td>
                <td>.03</td>
                <td>.722 (+36.0)</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="bottom">
                <td><break/></td>
                <td>Excluding controversial documents</td>
                <td>.650 (+20.1)</td>
                <td>.05</td>
                <td>.790 (+61.3)</td>
                <td>.02</td>
                <td>.759 (+35.2)</td>
                <td>.02</td>
                <td>.737 (+39.0)</td>
                <td>&#60;.01</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>

        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Histogram of Kendall <italic>W</italic> evaluating readability ratings between any 2 Amazon Mechanical Turk users.</p>
          </caption>
          <graphic xlink:href="medinform_v6i1e17_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>

        <p>To measure a user’s conformity in relation to others, we calculated the mean Kendall <italic>W</italic> between individual users and all of their peers. <xref ref-type="fig" rid="figure3">Figure 3</xref> shows the distribution.</p>
        <p>Approximately one-third of the users were highly conforming (mean <italic>W</italic>≥.7) with others, whereas 7% (6/90) were eccentric (mean <italic>W</italic>&#60;.5). This result suggests that, despite individual differences in their background knowledge about the subject matter, AMT users still exhibited a consensus on a document’s difficulty level. We also noted that our system was able to predict readability orders similar to those of a “regular” user. Our system’s mean <italic>W</italic> was highly correlated with a user’s conformity (ρ=.85). In contrast, the FKGL formula’s predicted grade levels did not show a strong correlation (ρ=–.13) with conformity.</p>
        <p><xref ref-type="table" rid="table2">Table 2</xref> (row “Excluding eccentric users”) shows the performance of models trained from data excluding eccentric users. All disease topics performed significantly better with our system than with FKGL. Our system’s performance on the combined disease topics, also significantly higher than with FKGL, was slightly lower than with the system using the full dataset. This could be due to the large number of samples removed from training even when we excluded only a small number of users, because the difference vectors were generated from all possible pairwise comparisons. On the individual disease topic level, however, the cancer and hypertension models outperformed those trained on the full training data.</p>
      </sec>
      <sec>
        <title>Controversial Documents</title>
        <p>In addition to annotator differences, another factor that contributes to inconsistent annotations is the nature of the documents. We postulated that some documents may have been challenging for the AMT users. For example, certain types of domain-specific writing may appear easy to understand to some but not all users, leading to inconsistent user ratings. These “controversial” documents would also have confused our system, which attempted to learn from the conflicting human annotation. To highlight the range of AMT users’ perceptions of difficulty, <xref ref-type="fig" rid="figure4">Figure 4</xref> shows the maximum difference in ratings assigned by AMT users to documents that were rated by at least two users (n=597).</p>
        <p>The mean difference was 3.8, suggesting that users’ perceptions of difficulty varied considerably. The 2 sources of documents (Wikipedia and EHR notes) contained approximately the same number of controversial documents (maximum difference &#62;5), and the cancer topic had more such documents than the other 2 topics. We further trained new models after removing controversial documents from the dataset. <xref ref-type="table" rid="table2">Table 2</xref> shows the performances of these models in the last row (“Excluding controversial documents”). Performance on 2 categories, cancer and diabetes, remained similar to that of the models trained from the full dataset. Performance on the hypertension set increased appreciably.</p>
      </sec>
      <sec>
        <title>Feature Ablation</title>
        <p>We compared the contribution of the different types of features included in our system. We trained separate models without the word frequency–based features, readability formula features, word length–based features, and word embedding–based features. <xref ref-type="table" rid="table3">Table 3</xref> shows the performance of these models.</p>
        <p>Excluding word embeddings resulted in the largest decrease in performance. The word frequency–based features did not appear to contribute much to the overall performance. Removing these features resulted in only a 0.1% performance decrease. This could be due to the nature of the word frequency corpus (a general English corpus without any particular emphasis on any domain) we used to calculate these features. The surface text characteristics captured by the formulas showed a moderate contribution, although they were not reliable stand-alone indicators. With the exception of 1 case, the contributions of the features were consistent across different disease topics—word embedding and word length–based features being the highest and word frequency the lowest.</p>

        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Histogram of individual Amazon Mechanical Turk users' conformity (measured by the mean of Kendall <italic>W</italic> against their peers).</p>
          </caption>
          <graphic xlink:href="medinform_v6i1e17_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>

        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Histogram of maximum differences in Amazon Mechanical Turk users' ratings of documents rated by at least two users.</p>
          </caption>
          <graphic xlink:href="medinform_v6i1e17_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>

        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Model performance (Kendall <italic>W</italic>) with feature ablation.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="310"/>
            <col width="170"/>
            <col width="170"/>
            <col width="170"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Feature set</td>
                <td>Cancer</td>
                <td>Diabetes</td>
                <td>Hypertension</td>
                <td>All</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="bottom">
                <td colspan="2">Full<sup>a</sup></td>
                <td>.656</td>
                <td>.790</td>
                <td>.715</td>
                <td>.734</td>
              </tr>
              <tr valign="top">
                <td colspan="6"><bold>Excluded feature</bold></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Frequency</td>
                <td>.652</td>
                <td>.792</td>
                <td>.710</td>
                <td>.733</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Formula</td>
                <td>.648</td>
                <td>.789</td>
                <td>.709</td>
                <td>.728</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Length</td>
                <td>.636</td>
                <td>.785</td>
                <td>.702</td>
                <td>.716</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Embedding</td>
                <td>.677</td>
                <td>.784</td>
                <td>.703</td>
                <td>.714</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>The system with all proposed features included (data from <xref ref-type="table" rid="table2">Table 2</xref>).</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>We explored methods to automatically assess the readability levels of clinical narratives. Our ranking-based system using simple textual features and easy-to-learn word embeddings outperformed predictions from applying FKGL. In all of the disease topics we assessed, our method achieved an increase of over 20%, with the majority of cases showing higher and statistically significant increases.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>One limitation of our method is that it may be necessary to prune inconsistent data before training a model. Some users’ perceptions of document readability may exhibit a different pattern from others’. Including conflicting data points may result in suboptimal models. A future study direction is to explore the trade-off between expert and crowdsourced annotations.</p>
        <p>Another limitation is that we trained our model on AMT users’ perceived document difficulty, which can be different from difficulty as assessed from a linguistic perspective.</p>
      </sec>
      <sec>
        <title>Comparison With Other Methods</title>
        <p>We applied a learning-to-rank approach to readability assessment, whereby we used comparisons of relative difficulty to train a model and, similarly, to predict an order based on document difficulty. Existing machine learning–based systems are usually designed around classification. They are often limited to a few predefined labels [<xref ref-type="bibr" rid="ref15">15</xref>] or require corpora labeled at distinct levels [<xref ref-type="bibr" rid="ref14">14</xref>]. The advantage of our approach is that we do not need expert annotation of grade levels on documents, and annotation may be crowdsourced as in our experiments. Acquiring more personalized training examples is also possible without explicit curation, as user actions may be implicitly mined to generate document difficulty comparisons, by using information retrieval methods.</p>
        <p>Furthermore, unlike many other machine learning–based methods that require deep natural language processing, such as parsing [<xref ref-type="bibr" rid="ref28">28</xref>] and discourse analysis [<xref ref-type="bibr" rid="ref29">29</xref>], our choice of feature set is relatively simple. The surface features from readability formulas and word frequencies were both easy to calculate. Well-established tools also exist to generate word embeddings from large corpora. Therefore, our system could be easily deployed in an EHR system.</p>
        <p>Lastly, although traditional readability formulas are very easy to use by nontechnical users, as they do not require training a machine learning model, they are inaccurate in determining the difficulty of complex documents. With simple features and widely available software packages, our proposed method is straightforward to implement.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>Patients’ access to their EHR notes has increased dramatically according to US national statistics. However, actively engaging patients in the management of their own health remains challenging. Assessing the readability of EHR notes and integrating educational assistance may make these notes more accessible for a layperson without professional training in medicine. To this end, we developed a new machine learning–based method to assess EHR readability from relative orders of text difficulty. We trained a learning-to-rank system to predict relative difficulty levels of given documents, instead of using the traditional classification approach, in which documents are assigned levels from a limited predefined set of values. Our experiments showed that this method significantly outperformed the widely used FKGL formula, and the improvement was consistent across different topics. Our system’s average concordance with an individual human user’s ratings was higher than the concordance between different human annotators. This method can potentially be personalized to individual users to better accommodate their background knowledge.</p>
      </sec>
    </sec>
  </body>
  <back>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AMT</term>
          <def>
            <p>Amazon Mechanical Turk</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">FKGL</term>
          <def>
            <p>Flesch-Kincaid Grade Level</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported in part by the Investigator Initiated Research grant 1I01HX001457-01 from the Health Services Research &#38; Development Program of the US Department of Veterans Affairs, and in part by the Center for Intelligent Information Retrieval, University of Massachusetts Amherst, USA. The content is solely the responsibility of the authors and does not represent the views of the US Department of Veterans Affairs, the US Government, or the Center for Intelligent Information Retrieval.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Greene</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Hibbard</surname>
            <given-names>JH</given-names>
          </name>
        </person-group>
        <article-title>Why does patient activation matter? An examination of the relationships between patient activation and health-related outcomes</article-title>
        <source>J Gen Intern Med</source>  
        <year>2012</year>  
        <month>05</month>  
        <volume>27</volume>  
        <issue>5</issue>  
        <fpage>520</fpage>  
        <lpage>6</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/22127797"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1007/s11606-011-1931-2</pub-id>
        <pub-id pub-id-type="medline">22127797</pub-id>
        <pub-id pub-id-type="pmcid">PMC3326094</pub-id></nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Begum</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Donald</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Ozolins</surname>
            <given-names>IZ</given-names>
          </name>
          <name name-style="western">
            <surname>Dower</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Hospital admissions, emergency department utilisation and patient activation for self-management among people with diabetes</article-title>
        <source>Diabetes Res Clin Pract</source>  
        <year>2011</year>  
        <month>08</month>  
        <volume>93</volume>  
        <issue>2</issue>  
        <fpage>260</fpage>  
        <lpage>7</lpage>  
        <pub-id pub-id-type="doi">10.1016/j.diabres.2011.05.031</pub-id>
        <pub-id pub-id-type="medline">21684030</pub-id>
        <pub-id pub-id-type="pii">S0168-8227(11)00287-7</pub-id></nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hibbard</surname>
            <given-names>JH</given-names>
          </name>
          <name name-style="western">
            <surname>Greene</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>What the evidence shows about patient activation: better health outcomes and care experiences; fewer data on costs</article-title>
        <source>Health Aff (Millwood)</source>  
        <year>2013</year>  
        <month>02</month>  
        <volume>32</volume>  
        <issue>2</issue>  
        <fpage>207</fpage>  
        <lpage>14</lpage>  
        <pub-id pub-id-type="doi">10.1377/hlthaff.2012.1061</pub-id>
        <pub-id pub-id-type="medline">23381511</pub-id>
        <pub-id pub-id-type="pii">32/2/207</pub-id></nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Mosen</surname>
            <given-names>DM</given-names>
          </name>
          <name name-style="western">
            <surname>Schmittdiel</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Hibbard</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Sobel</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Remmers</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Bellows</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Is patient activation associated with outcomes of care for adults with chronic conditions?</article-title>
        <source>J Ambul Care Manage</source>  
        <year>2007</year>  
        <month>03</month>  
        <volume>30</volume>  
        <issue>1</issue>  
        <fpage>21</fpage>  
        <lpage>9</lpage>  
        <pub-id pub-id-type="medline">17170635</pub-id>
        <pub-id pub-id-type="pii">00004479-200701000-00005</pub-id></nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Henry</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Pylypchuk</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Patel</surname>
            <given-names>V</given-names>
          </name>
        </person-group>
        <source>Electronic capabilities for patients among U.S. non-federal acute care hospitals: 2012-2015. ONC data brief 38</source>  
        <year>2016</year>  
        <month>09</month>  
        <access-date>2016-10-07</access-date>
        <publisher-loc>Washington, DC</publisher-loc>
        <publisher-name>Office of the National Coordinator for Health Information Technology</publisher-name>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dashboard.healthit.gov/evaluations/data-briefs/hospitals-patient-engagement-electronic-capabilities-2015.php">http://dashboard.healthit.gov/evaluations/data-briefs/hospitals-patient-engagement-electronic-capabilities-2015.php</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6l5V1ZSLl"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Agarwal</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Hansberry</surname>
            <given-names>DR</given-names>
          </name>
          <name name-style="western">
            <surname>Sabourin</surname>
            <given-names>V</given-names>
          </name>
          <name name-style="western">
            <surname>Tomei</surname>
            <given-names>KL</given-names>
          </name>
          <name name-style="western">
            <surname>Prestigiacomo</surname>
            <given-names>CJ</given-names>
          </name>
        </person-group>
        <article-title>A comparative analysis of the quality of patient education materials from medical specialties</article-title>
        <source>JAMA Intern Med</source>  
        <year>2013</year>  
        <month>07</month>  
        <day>8</day>  
        <volume>173</volume>  
        <issue>13</issue>  
        <fpage>1257</fpage>  
        <lpage>9</lpage>  
        <pub-id pub-id-type="doi">10.1001/jamainternmed.2013.6060</pub-id>
        <pub-id pub-id-type="medline">23689468</pub-id>
        <pub-id pub-id-type="pii">1689983</pub-id></nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Huang</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Fang</surname>
            <given-names>CH</given-names>
          </name>
          <name name-style="western">
            <surname>Agarwal</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Bhagat</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Eloy</surname>
            <given-names>JA</given-names>
          </name>
          <name name-style="western">
            <surname>Langer</surname>
            <given-names>PD</given-names>
          </name>
        </person-group>
        <article-title>Assessment of online patient education materials from major ophthalmologic associations</article-title>
        <source>JAMA Ophthalmol</source>  
        <year>2015</year>  
        <month>04</month>  
        <volume>133</volume>  
        <issue>4</issue>  
        <fpage>449</fpage>  
        <lpage>54</lpage>  
        <pub-id pub-id-type="doi">10.1001/jamaophthalmol.2014.6104</pub-id>
        <pub-id pub-id-type="medline">25654639</pub-id>
        <pub-id pub-id-type="pii">2107258</pub-id></nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Watad</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Bragazzi</surname>
            <given-names>NL</given-names>
          </name>
          <name name-style="western">
            <surname>Brigo</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Sharif</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Amital</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>McGonagle</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Shoenfeld</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Adawi</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Readability of Wikipedia pages on autoimmune disorders: systematic quantitative assessment</article-title>
        <source>J Med Internet Res</source>  
        <year>2017</year>  
        <month>07</month>  
        <day>18</day>  
        <volume>19</volume>  
        <issue>7</issue>  
        <fpage>e260</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2017/7/e260/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.8225</pub-id>
        <pub-id pub-id-type="medline">28720555</pub-id>
        <pub-id pub-id-type="pii">v19i7e260</pub-id></nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Brigo</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Otte</surname>
            <given-names>WM</given-names>
          </name>
          <name name-style="western">
            <surname>Igwe</surname>
            <given-names>SC</given-names>
          </name>
          <name name-style="western">
            <surname>Tezzon</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Nardone</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>Clearly written, easily comprehended? The readability of websites providing information on epilepsy</article-title>
        <source>Epilepsy Behav</source>  
        <year>2015</year>  
        <month>03</month>  
        <volume>44</volume>  
        <fpage>35</fpage>  
        <lpage>9</lpage>  
        <pub-id pub-id-type="doi">10.1016/j.yebeh.2014.12.029</pub-id>
        <pub-id pub-id-type="medline">25601720</pub-id>
        <pub-id pub-id-type="pii">S1525-5050(14)00699-4</pub-id></nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Brigo</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Erro</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>The readability of the English Wikipedia article on Parkinson's disease</article-title>
        <source>Neurol Sci</source>  
        <year>2015</year>  
        <month>06</month>  
        <volume>36</volume>  
        <issue>6</issue>  
        <fpage>1045</fpage>  
        <lpage>6</lpage>  
        <pub-id pub-id-type="doi">10.1007/s10072-015-2077-5</pub-id>
        <pub-id pub-id-type="medline">25596713</pub-id></nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Davis</surname>
            <given-names>GT</given-names>
          </name>
          <name name-style="western">
            <surname>Singh</surname>
            <given-names>H</given-names>
          </name>
        </person-group>
        <article-title>Should patients get direct access to their laboratory test results? An answer with many questions</article-title>
        <source>JAMA</source>  
        <year>2011</year>  
        <month>12</month>  
        <day>14</day>  
        <volume>306</volume>  
        <issue>22</issue>  
        <fpage>2502</fpage>  
        <lpage>3</lpage>  
        <pub-id pub-id-type="doi">10.1001/jama.2011.1797</pub-id>
        <pub-id pub-id-type="medline">22122864</pub-id>
        <pub-id pub-id-type="pii">jama.2011.1797</pub-id></nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Koh</surname>
            <given-names>HK</given-names>
          </name>
          <name name-style="western">
            <surname>Brach</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Harris</surname>
            <given-names>LM</given-names>
          </name>
          <name name-style="western">
            <surname>Parchman</surname>
            <given-names>ML</given-names>
          </name>
        </person-group>
        <article-title>A proposed 'health literate care model' would constitute a systems approach to improving patients' engagement in care</article-title>
        <source>Health Aff (Millwood)</source>  
        <year>2013</year>  
        <month>02</month>  
        <volume>32</volume>  
        <issue>2</issue>  
        <fpage>357</fpage>  
        <lpage>67</lpage>  
        <pub-id pub-id-type="doi">10.1377/hlthaff.2012.1205</pub-id>
        <pub-id pub-id-type="medline">23381529</pub-id>
        <pub-id pub-id-type="pii">32/2/357</pub-id></nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Flesch</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>A new readability yardstick</article-title>
        <source>J Appl Psychol</source>  
        <year>1948</year>  
        <month>06</month>  
        <volume>32</volume>  
        <issue>3</issue>  
        <fpage>221</fpage>  
        <lpage>33</lpage>  
        <pub-id pub-id-type="medline">18867058</pub-id></nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Kim</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Goryachev</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Rosemblat</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Browne</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Keselman</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Zeng-Treitler</surname>
            <given-names>Q</given-names>
          </name>
        </person-group>
        <article-title>Beyond surface characteristics: a new health text-specific readability measurement</article-title>
        <source>AMIA Annu Symp Proc</source>  
        <year>2007</year>  
        <month>10</month>  
        <day>11</day>  
        <fpage>418</fpage>  
        <lpage>22</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/18693870"/>
        </comment>  
        <pub-id pub-id-type="medline">18693870</pub-id>
        <pub-id pub-id-type="pmcid">PMC2655856</pub-id></nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Leroy</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Miller</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Rosemblat</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Browne</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>A balanced approach to health information evaluation: a vocabulary-based naïve Bayes classifier and readability formulas</article-title>
        <source>J Am Soc Inf Sci Technol</source>  
        <year>2008</year>  
        <month>07</month>  
        <volume>59</volume>  
        <issue>9</issue>  
        <fpage>1409</fpage>  
        <lpage>1419</lpage>  
        <pub-id pub-id-type="doi">10.1002/asi.20837</pub-id></nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Redish</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Readability formulas have even more limitations than Klare discusses</article-title>
        <source>ACM J Comput Doc</source>  
        <year>2000</year>  
        <month>08</month>  
        <day>01</day>  
        <volume>24</volume>  
        <issue>3</issue>  
        <fpage>132</fpage>  
        <lpage>137</lpage>  
        <pub-id pub-id-type="doi">10.1145/344599.344637</pub-id></nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>O'Bryant</surname>
            <given-names>SE</given-names>
          </name>
          <name name-style="western">
            <surname>Lucas</surname>
            <given-names>JA</given-names>
          </name>
          <name name-style="western">
            <surname>Willis</surname>
            <given-names>FB</given-names>
          </name>
          <name name-style="western">
            <surname>Smith</surname>
            <given-names>GE</given-names>
          </name>
          <name name-style="western">
            <surname>Graff-Radford</surname>
            <given-names>NR</given-names>
          </name>
          <name name-style="western">
            <surname>Ivnik</surname>
            <given-names>RJ</given-names>
          </name>
        </person-group>
        <article-title>Discrepancies between self-reported years of education and estimated reading level among elderly community-dwelling African-Americans: analysis of the MOAANS data</article-title>
        <source>Arch Clin Neuropsychol</source>  
        <year>2007</year>  
        <month>03</month>  
        <volume>22</volume>  
        <issue>3</issue>  
        <fpage>327</fpage>  
        <lpage>32</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://linkinghub.elsevier.com/retrieve/pii/S0887-6177(07)00014-5"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1016/j.acn.2007.01.007</pub-id>
        <pub-id pub-id-type="medline">17336494</pub-id>
        <pub-id pub-id-type="pii">S0887-6177(07)00014-5</pub-id></nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Manly</surname>
            <given-names>JJ</given-names>
          </name>
          <name name-style="western">
            <surname>Jacobs</surname>
            <given-names>DM</given-names>
          </name>
          <name name-style="western">
            <surname>Touradji</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Small</surname>
            <given-names>SA</given-names>
          </name>
          <name name-style="western">
            <surname>Stern</surname>
            <given-names>Y</given-names>
          </name>
        </person-group>
        <article-title>Reading level attenuates differences in neuropsychological test performance between African American and white elders</article-title>
        <source>J Int Neuropsychol Soc</source>  
        <year>2002</year>  
        <month>03</month>  
        <volume>8</volume>  
        <issue>3</issue>  
        <fpage>341</fpage>  
        <lpage>8</lpage>  
        <pub-id pub-id-type="medline">11939693</pub-id></nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Manly</surname>
            <given-names>JJ</given-names>
          </name>
          <name name-style="western">
            <surname>Schupf</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Tang</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Stern</surname>
            <given-names>Y</given-names>
          </name>
        </person-group>
        <article-title>Cognitive decline and literacy among ethnically diverse elders</article-title>
        <source>J Geriatr Psychiatry Neurol</source>  
        <year>2005</year>  
        <month>12</month>  
        <volume>18</volume>  
        <issue>4</issue>  
        <fpage>213</fpage>  
        <lpage>7</lpage>  
        <pub-id pub-id-type="doi">10.1177/0891988705281868</pub-id>
        <pub-id pub-id-type="medline">16306242</pub-id>
        <pub-id pub-id-type="pii">18/4/213</pub-id></nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zheng</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Yu</surname>
            <given-names>H</given-names>
          </name>
        </person-group>
        <source>Ranking readability demo</source>  
        <year>2018</year>  
        <access-date>2018-03-15</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://bio-nlp.org/readability-ranking">http://bio-nlp.org/readability-ranking</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6xwCIPM6x"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Keselman</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Slaughter</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Smith</surname>
            <given-names>CA</given-names>
          </name>
          <name name-style="western">
            <surname>Kim</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Divita</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Browne</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Tsai</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Zeng-Treitler</surname>
            <given-names>Q</given-names>
          </name>
        </person-group>
        <article-title>Towards consumer-friendly PHRs: patients' experience with reviewing their health records</article-title>
        <source>AMIA Annu Symp Proc</source>  
        <year>2007</year>  
        <fpage>399</fpage>  
        <lpage>403</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/18693866"/>
        </comment>  
        <pub-id pub-id-type="medline">18693866</pub-id>
        <pub-id pub-id-type="pmcid">PMC2655877</pub-id></nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Pyper</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Amery</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Watson</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Crook</surname>
            <given-names>C</given-names>
          </name>
        </person-group>
        <article-title>Patients' experiences when accessing their on-line electronic patient records in primary care</article-title>
        <source>Br J Gen Pract</source>  
        <year>2004</year>  
        <month>01</month>  
        <volume>54</volume>  
        <issue>498</issue>  
        <fpage>38</fpage>  
        <lpage>43</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://bjgp.org/cgi/pmidlookup?view=long&#38;pmid=14965405"/>
        </comment>  
        <pub-id pub-id-type="medline">14965405</pub-id>
        <pub-id pub-id-type="pmcid">PMC1314776</pub-id></nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Joachims</surname>
            <given-names>T</given-names>
          </name>
        </person-group>
        <article-title>Training linear SVMs in linear time</article-title>
        <year>2006</year>  
        <conf-name>12th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name>
        <conf-date>Aug 20-23, 2006</conf-date>
        <conf-loc>Philadelphia, PA, USA</conf-loc>
        <publisher-loc>New York, NY</publisher-loc>
        <publisher-name>ACM</publisher-name>
        <fpage>217</fpage>  
        <lpage>226</lpage>  
        <pub-id pub-id-type="doi">10.1145/1150402.1150429</pub-id></nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zheng</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Yu</surname>
            <given-names>H</given-names>
          </name>
        </person-group>
        <article-title>Readability formulas and user perceptions of electronic health records difficulty: a corpus study</article-title>
        <source>J Med Internet Res</source>  
        <year>2017</year>  
        <month>03</month>  
        <day>02</day>  
        <volume>19</volume>  
        <issue>3</issue>  
        <fpage>e59</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2017/3/e59/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.6962</pub-id>
        <pub-id pub-id-type="medline">28254738</pub-id>
        <pub-id pub-id-type="pii">v19i3e59</pub-id>
        <pub-id pub-id-type="pmcid">PMC5355629</pub-id></nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Leroy</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Kauchak</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <article-title>The effect of word familiarity on actual and perceived text difficulty</article-title>
        <source>J Am Med Inform Assoc</source>  
        <year>2014</year>  
        <month>02</month>  
        <volume>21</volume>  
        <issue>e1</issue>  
        <fpage>e169</fpage>  
        <lpage>72</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24100710"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1136/amiajnl-2013-002172</pub-id>
        <pub-id pub-id-type="medline">24100710</pub-id>
        <pub-id pub-id-type="pii">amiajnl-2013-002172</pub-id>
        <pub-id pub-id-type="pmcid">PMC3957403</pub-id></nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Mikolov</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Corrado</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Dean</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Efficient estimation of word representations in vector space</article-title>
        <year>2013</year>  
        <conf-name>Workshop at ICLR</conf-name>
        <conf-date>May 2-4, 2013</conf-date>
        <conf-loc>Scottsdale, AZ, USA</conf-loc></nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Kendall</surname>
            <given-names>MG</given-names>
          </name>
          <name name-style="western">
            <surname>Smith</surname>
            <given-names>BB</given-names>
          </name>
        </person-group>
        <article-title>The problem of m rankings</article-title>
        <source>Ann Math Stat</source>  
        <year>1939</year>  
        <volume>10</volume>  
        <issue>3</issue>  
        <fpage>275</fpage>  
        <lpage>287</lpage> </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Schwarm</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Ostendorf</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Reading level assessment using support vector machines and statistical language models</article-title>
        <year>2005</year>  
        <conf-name>43rd Annual Meeting on Association for Computational Linguistics</conf-name>
        <conf-date>Jun 25-30, 2005</conf-date>
        <conf-loc>Ann Arbor, MI, USA</conf-loc>
        <publisher-loc>Stroudsburg, PA</publisher-loc>
        <publisher-name>Association for Computational Linguistics</publisher-name>
        <fpage>523</fpage>  
        <lpage>530</lpage>  
        <pub-id pub-id-type="doi">10.3115/1219840.1219905</pub-id></nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Feng</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Jansche</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Huenerfauth</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Elhadad</surname>
            <given-names>N</given-names>
          </name>
        </person-group>
        <article-title>A comparison of features for automatic readability assessment</article-title>
        <year>2010</year>  
        <conf-name>23rd International Conference on Computational Linguistics (COLING)</conf-name>
        <conf-date>Aug 23-27, 2010</conf-date>
        <conf-loc>Beijing, China</conf-loc>
        <fpage>276</fpage>  
        <lpage>84</lpage> </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
