<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
    <article-id pub-id-type="publisher-id">v6i4e45</article-id>
    <article-id pub-id-type="pmid">30497991</article-id>
    <article-id pub-id-type="doi">10.2196/medinform.9162</article-id>
    <article-categories>
      <subj-group subj-group-type="heading">
        <subject>Original Paper</subject>
      </subj-group>
      <subj-group subj-group-type="article-type">
        <subject>Original Paper</subject>
      </subj-group>
    </article-categories>
    <title-group>
      <article-title>Novel Approach to Cluster Patient-Generated Data Into Actionable Topics: Case Study of a Web-Based Breast Cancer Forum</article-title>
    </title-group>
    <contrib-group>
      <contrib contrib-type="editor">
        <name>
          <surname>Eysenbach</surname>
          <given-names>Gunther</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Attai</surname>
          <given-names>Deanna</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Hafez</surname>
          <given-names>Dina</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="author" id="contrib1" corresp="yes">
      <name name-style="western">
        <surname>Jones</surname>
        <given-names>Josette</given-names>
      </name>
      <degrees>PhD, RN</degrees>
      <xref rid="aff1" ref-type="aff">1</xref>
      <address>
        <institution>Health Informatics</institution>
        <institution>BioHealth Informatics Department</institution>
        <institution>Indiana University, Indianapolis</institution>
        <addr-line>535 West Michigan Street</addr-line>
        <addr-line>Indianapolis, IN, 46202</addr-line>
        <country>United States</country>
        <phone>1 3172748059</phone>
        <email>jofjons@iupui.edu</email>
      </address>  
      <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-4996-8595</ext-link></contrib>
      <contrib contrib-type="author" id="contrib2" equal-contrib="yes">
        <name name-style="western">
          <surname>Pradhan</surname>
          <given-names>Meeta</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff2" ref-type="aff">2</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-4450-1897</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib3">
        <name name-style="western">
          <surname>Hosseini</surname>
          <given-names>Masoud</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff1" ref-type="aff">1</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-4062-9092</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib4" equal-contrib="yes">
        <name name-style="western">
          <surname>Kulanthaivel</surname>
          <given-names>Anand</given-names>
        </name>
        <degrees>MIS</degrees>
        <xref rid="aff1" ref-type="aff">1</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-1912-7613</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib5">
        <name name-style="western">
          <surname>Hosseini</surname>
          <given-names>Mahmood</given-names>
        </name>
        <degrees>BS</degrees>
        <xref rid="aff3" ref-type="aff">3</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-6924-2978</ext-link>
      </contrib>
    </contrib-group>
    <aff id="aff1">
    <label>1</label>
    <institution>Health Informatics</institution>
    <institution>BioHealth Informatics Department</institution>  
    <institution>Indiana University, Indianapolis</institution>  
    <addr-line>Indianapolis, IN</addr-line>
    <country>United States</country></aff>
    <aff id="aff2">
      <label>2</label>
      <institution>Indiana Biosciences Research Institute</institution>
      <addr-line>Indianapolis, IN</addr-line>
      <country>United States</country>
    </aff>
    <aff id="aff3">
    <label>3</label>
    <institution>Purdue School of Science and Technology</institution>
    <institution>Purdue University, Indianapolis</institution>  
    <addr-line>Indianapolis, IN</addr-line>
    <country>United States</country></aff>
    <author-notes>
      <corresp>Corresponding Author: Josette Jones 
      <email>jofjons@iupui.edu</email></corresp>
    </author-notes>
    <pub-date pub-type="collection"><season>Oct-Dec</season><year>2018</year></pub-date>
    <pub-date pub-type="epub">
      <day>29</day>
      <month>11</month>
      <year>2018</year>
    </pub-date>
    <volume>6</volume>
    <issue>4</issue>
    <elocation-id>e45</elocation-id>
    <!--history from ojs - api-xml-->
    <history>
      <date date-type="received">
        <day>11</day>
        <month>10</month>
        <year>2017</year>
      </date>
      <date date-type="rev-request">
        <day>23</day>
        <month>11</month>
        <year>2017</year>
      </date>
      <date date-type="rev-recd">
        <day>14</day>
        <month>3</month>
        <year>2018</year>
      </date>
      <date date-type="accepted">
        <day>21</day>
        <month>6</month>
        <year>2018</year>
      </date>
    </history>
    <!--(c) the authors - correct author names and publication date here if necessary. Date in form ', dd.mm.yyyy' after jmir.org-->
    <copyright-statement>©Josette Jones, Meeta Pradhan, Masoud Hosseini, Anand Kulanthaivel, Mahmood Hosseini. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 29.11.2018.</copyright-statement>
    <copyright-year>2018</copyright-year>
    <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
      <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
    </license>  
    <self-uri xlink:href="http://medinform.jmir.org/2018/4/e45/" xlink:type="simple"/>
    <abstract>
      <sec sec-type="background">
        <title>Background</title>
        <p>The increasing use of social media and mHealth apps has generated new opportunities for health care consumers to share information about their health and well-being. Information shared through social media contains not only medical information but also valuable information about how the survivors manage disease and recovery in the context of daily life.</p>
      </sec>
      <sec sec-type="objective">
        <title>Objective</title>
        <p>The objective of this study was to determine the feasibility of acquiring and modeling the topics of a major online breast cancer support forum. Breast cancer patient support forums were selected to discover the hidden, less obvious aspects of disease management and recovery.</p>
      </sec>
      <sec sec-type="methods">
        <title>Methods</title>
        <p>First, manual topic categorization was performed using qualitative content analysis (QCA) of each individual forum board. Second, we requested permission from the Breastcancer.org Community for a more in-depth analysis of the postings. Topic modeling was then performed using open source software Machine Learning Language Toolkit, followed by multiple linear regression (MLR) analysis to detect highly correlated topics among the different website forums.</p>
      </sec>
      <sec sec-type="results">
        <title>Results</title>
        <p>QCA of the forums resulted in 20 categories of user discussion. The final topic model organized &#62;4 million postings into 30 manageable topics. Using qualitative analysis of the topic models and statistical analysis, we grouped these 30 topics into 4 distinct clusters with similarity scores of ≥0.80; these clusters were labeled Symptoms &#38; Diagnosis, Treatment, Financial, and Family &#38; Friends. A clinician review confirmed the clinical significance of the topic clusters, allowing for future detection of actionable items within social media postings. To identify the most significant topics across individual forums, MLR demonstrated that 6 topics—based on the Akaike information criterion values ranging from −642.75 to −412.32—were statistically significant.</p>
      </sec>
      <sec sec-type="conclusions">
        <title>Conclusions</title>
        <p>The developed method provides an insight into the areas of interest and concern, including those not ascertainable in the clinic. Such topics included support from lay and professional caregivers and late side effects of therapy that consumers discuss in social media and may be of interest to clinicians. The developed methods and results indicate the potential of social media to inform the clinical workflow with regard to the impact of recovery on daily life.</p>
      </sec>
    </abstract>
    <kwd-group>
      <kwd>data interpretation</kwd>
      <kwd>natural language processing</kwd>
      <kwd>patient-generated information</kwd>
      <kwd>social media</kwd>
      <kwd>statistical analysis</kwd>
      <kwd>infodemiology</kwd>
    </kwd-group></article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Health care is currently undergoing transformation by capitalizing on information technology and patient-consumer engagement and activation through health information technology such as patient portals and mHealth apps. Consumer engagement is assumed to strengthen providers’ abilities to tailor their care to the consumers’ needs, preferences, and abilities. The increasing use of smartphones, mobile apps, and remote monitoring devices, coupled with providers’ deployments of electronic health records, patient portals, and secure messaging, offers innovative ways to connect patients and providers and to strengthen consumers’ engagement in their health and well-being [<xref ref-type="bibr" rid="ref1">1</xref>]. In addition, health consumers have embraced social media, enabling them to share and discuss how they manage their health and well-being with others with similar health issues. These social media and mHealth apps generate important data outside the health care settings and, when shared with providers, expand the depth, breadth, and continuity of information available to optimize health care and outcomes.</p>
      <p>Despite the proliferation of social media use, such as blogs and forums, little is known about the scope and quality of information shared, or the purposes that social media sites serve for consumer decisional and support needs [<xref ref-type="bibr" rid="ref2">2</xref>]. Social media retains large amounts of valuable information about consumers’ contextual and environmental (day-to-day) factors while managing their health and well-being; such issues form a major foundation of human health. However, analyzing those free-text data to discover these <italic>hidden</italic> aspects of health consumers’ lives and understand their health information needs beyond those routinely addressed by health care providers is challenging [<xref ref-type="bibr" rid="ref3">3</xref>].</p>
      <p>This study explores approaches for analyzing social media data and extracting potentially valuable information on managing health and well-being beyond the context of health care. As it is known that breast cancer patients and survivors often join social media to fulfill their information needs and discuss their daily challenges and concerns, whether or not related to health [<xref ref-type="bibr" rid="ref4">4</xref>], using those venues is an apparent choice. Some concerns might not be shared with health care providers for many reasons. For example, patients might think it is not necessary to discuss the topic, may feel embarrassed about the issue, or may not even know there is a problem [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. Hence, we explored ways to discover issues that are not commonly shared but are important for overall health and well-being.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>First, manual topic categorization was performed using qualitative content analysis (QCA) of each individual forum board. Second, we requested permission from the Breastcancer.org Community for a more in-depth analysis of the postings. In addition, natural language processing (NLP) and statistical modeling approaches were used to cluster &#62;4 million postings into manageable topics. Finally, topic modeling was performed using open source software, followed by multiple linear regression (MLR) analysis to detect highly correlated topics among the different website forums. The methodology is outlined in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p>
      </sec>
      <sec>
        <title>Manual Categorization of Posts</title>
        <p>A Google search was performed for breast cancer forum websites. Selection criteria were active websites (having posts in the week of search) and in the English language. Each website must have at least 5000 members or have a minimum of 50,000 posts in total, and the posts on the site must be organized into categories. Among the resulting 20 websites, 5 were included on the basis of the selection criteria. These 5 remaining websites contained 4,901,516 posts organized in 211 forums (<xref ref-type="table" rid="table1">Table 1</xref>). The forum posts were further manually analyzed for consensus among the team members.</p>
        <p>Team members were assigned to review the titles of 211 forums across the 5 forum websites and organize them into 4 main top-level categories and 16 subcategories correlating to several domains from the report on the social determinants of health published by the Institute of Medicine [<xref ref-type="bibr" rid="ref12">12</xref>]. The quantities of posts belonging to each category and subcategory were calculated by the forum that the posts belonged to. The 3 most dominant subcategories across all websites were as follows: Treatments (1.49/4.90 million, 30.5%, posts), Diagnosed–Psychosocial Support–Similar Patients (1.34/4.90 million, 27.3%, posts), and Diagnosed–Psychosocial Support–Life (0.83/4.90 million, 16.9%, posts). After the posts were categorized, the research team iteratively validated (with a random sample of 20 posts from each forum) and consolidated the initial categorization, assuring the quality and correctness of the method.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Overview of the methods used to analyze the study content.</p>
          </caption>
          <graphic xlink:href="medinform_v6i4e45_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Breast cancer websites explored.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="340"/>
            <col width="300"/>
            <col width="70"/>
            <col width="70"/>
            <col width="70"/>
            <col width="70"/>
            <col width="80"/>
            <thead>
              <tr valign="top">
                <td>Website name</td>
                <td>Site URL</td>
                <td>Country</td>
                <td>Forums</td>
                <td>Threads</td>
                <td>Posts</td>
                <td>Members</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="bottom">
                <td>Breastcancer.org Community [<xref ref-type="bibr" rid="ref7">7</xref>]</td>
                <td>community.breastcancer.org</td>
                <td>US<sup>a</sup></td>
                <td>80</td>
                <td>121,688</td>
                <td>3,608,324</td>
                <td>153,620</td>
              </tr>
              <tr valign="bottom">
                <td>Breast Cancer Care [<xref ref-type="bibr" rid="ref8">8</xref>]</td>
                <td>forum.breastcancercare.co.uk</td>
                <td>UK<sup>b</sup></td>
                <td>54</td>
                <td>36,949</td>
                <td>782,486</td>
                <td>N/A<sup>c</sup></td>
              </tr>
              <tr valign="top">
                <td>Susan G Komen Foundation: Message Board [<xref ref-type="bibr" rid="ref9">9</xref>]</td>
                <td>apps.komen.org/Forums</td>
                <td>US</td>
                <td>24</td>
                <td>44,175</td>
                <td>354,592</td>
                <td>26,883</td>
              </tr>
              <tr valign="top">
                <td>Triple Negative Breast Cancer: Forums [<xref ref-type="bibr" rid="ref10">10</xref>]</td>
                <td>forum.tnbcfoundation.org</td>
                <td>US</td>
                <td>17</td>
                <td>9641</td>
                <td>100,706</td>
                <td>123,427</td>
              </tr>
              <tr valign="top">
                <td>No Surrender Breast Cancer Foundation [<xref ref-type="bibr" rid="ref11">11</xref>]</td>
                <td>nosurrenderbreastcancersupportforum.com</td>
                <td>US</td>
                <td>36</td>
                <td>2443</td>
                <td>55,498</td>
                <td>5549</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>US: United States.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>UK: United Kingdom.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>N/A: not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Data Extraction, Natural Language Processing, and Statistical Modeling</title>
        <p>Data from a public breast cancer internet discussion forum were extracted, cleaned, and processed; multiple approaches merging NLP with statistical modeling were implemented for knowledge discovery. In addition, off-the-shelf products were used to develop and streamline the analytical approach to cluster most-occurring topics of discussions. The methodology developed revealed several topics that may be of importance for care planning and, thus, need to be incorporated in the electronic health record. In addition, advanced text mining will be a foundation for predictive modeling of consumers’ health information needs and provide interactive solutions.</p>
      </sec>
      <sec>
        <title>Extracting (Scraping) Forum Data</title>
        <p>Postings from the Breastcancer.org Community website were selected for further analyses, as this website contained the highest number of total posts (3.61/4.90 million, 73.6%, posts across all 5 websites selected in this study). Permission was obtained from the Web administrators to download and analyze all the data logged in the site.</p>
        <p>The Breastcancer.org Community site includes 80 main forums organized by the site administrators into 9 sections. Users self-select in which forum, and thus in which section, a post that they make will go. To capture information within the forum posts, an in-house scraping tool in the <italic>PHP Hypertext Preprocessor</italic> language was developed by a team member. Forum metadata along with the actual posts were extracted; the text within the different posts was aggregated into 80 text files each corresponding to a forum. The files were named based on the forum ID number. The data were saved in the JavaScript Object Notation format. <xref ref-type="app" rid="app1">Multimedia Appendix 1</xref> shows the forum names along with the number of threads and posts in each forum.</p>
      </sec>
      <sec>
        <title>Applying Topic Modeling</title>
        <p>Topic models provide a simple way to analyze large volumes of unlabeled text. A “topic” consists of a cluster of words that frequently occur together. Using contextual clues, topic models can connect words with similar meanings and distinguish between uses of words with multiple meanings. One of the leading approaches used for topic modeling is Latent Dirichlet allocation (LDA), which is one of the most popular methods in NLP [<xref ref-type="bibr" rid="ref13">13</xref>]. LDA represents a document as a distribution of “topics,” where a topic is itself a distribution over words (and may or may not be similar to a forum topic). Looping through each word in every document, the LDA algorithm assigns every word to a temporary topic in a semirandom manner and iteratively updates topic assignments. For each word, its topic assignment is subsequently updated based on 2 criteria as follows: (1) the prevalence of the word across all topics and (2) the prevalence of the topics within the documents.</p>
        <p>The Machine Learning Language Toolkit (MALLET) open source tool (University of Massachusetts, Amherst, MA, USA) [<xref ref-type="bibr" rid="ref14">14</xref>] was used to execute the LDA algorithm on the data to extract the main topics. MALLET is a Java-based tool developed at the University of Massachusetts Amherst, which is used for the analysis of data in a textual format such as document classification, clustering, topic modeling, information extraction, and other machine learning apps. After scraping the forum data and saving into 80 files representing each forum, the files were imported into the MALLET tool. MALLET generates two tab-delimited text files as a result of algorithm execution. One file contains the topic ID, and the words related to that topic (aka the topic keywords; <xref ref-type="table" rid="table2">Table 2</xref>).</p>
        <p>MALLET was run iteratively, customized to generate 15 topics, 20 topics, and 30 topics, respectively. Topic labels were added by consensus of the research team based on the semantics of the word cluster. Some topic labels in different sets of topics were identical based on the semantic similarity, but the topic words and strength are different for each of the 3 sets generated. No new topics were generated at the third iteration; the MALLET categorization of 30 word baskets was used for further analysis.</p>
        <p>For each iteration and each file, the topic composition and corresponding LDA strength were computed, providing us a way to infer the latent structure of the text file. The resulting output is a topic ID-by-text file matrix known as a file-feature set (<xref ref-type="table" rid="table3">Table 3</xref>). The first column shows the name of the file; the rest of the columns are best considered as (topic-ID, topic-strength) pairs. For example, it is noted that file F100 has a Topic 12 strength of 0.275 (27.5%). For each document, there are as many of these pairs as there are topics, although only the top 5 topics for each file and the first 4 files are shown for brevity.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>The partial table of topics generated by Machine Learning Language Toolkit in the 30-topic model, with interpretations (the list goes on up to the 30th topic; only 3 are shown for brevity).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="300"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>Machine Learning Language Toolkit topic identifier</td>
                <td>Topic label</td>
                <td>Topic keywords</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>Diagnostic testing and waiting for results</td>
                <td>breast biopsy cancer lump results ultrasound benign surgeon mammogram doctor mri waiting back mammo good radiologist feel pain left i'm</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>Side effects of inflammation and its treatment</td>
                <td>breast ibc skin symptoms pain rash red cancer nipple biopsy infection diagnosed antibiotics swollen treatment left specialist redness swelling lymph</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>Positive results after recurrence</td>
                <td>chemo stage years cancer treatment nodes onc tumor triple negative taxol positive rads year diagnosed node recurrence congratulations lymph radiation</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>A portion of the file-feature set generated by Machine Learning Language Toolkit software (the list goes on up to the 80th file and 30th topic; values were truncated for brevity of display).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <thead>
              <tr valign="bottom">
                <td>File identifier<sup>a</sup></td>
                <td>Topic ID<sup>b</sup></td>
                <td>Strength<sup>c</sup></td>
                <td>Topic ID</td>
                <td>Strength</td>
                <td>Topic ID</td>
                <td>Strength</td>
                <td>Topic ID</td>
                <td>Strength</td>
                <td>Topic ID</td>
                <td>Strength</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>F100</td>
                <td>12</td>
                <td>0.275</td>
                <td>18</td>
                <td>0.269</td>
                <td>2</td>
                <td>0.251</td>
                <td>5</td>
                <td>0.06</td>
                <td>7</td>
                <td>0.053</td>
              </tr>
              <tr valign="top">
                <td>F102</td>
                <td>2</td>
                <td>0.542</td>
                <td>18</td>
                <td>0.136</td>
                <td>7</td>
                <td>0.087</td>
                <td>12</td>
                <td>0.056</td>
                <td>1</td>
                <td>0.04</td>
              </tr>
              <tr valign="top">
                <td>F104</td>
                <td>2</td>
                <td>0.315</td>
                <td>14</td>
                <td>0.118</td>
                <td>1</td>
                <td>0.104</td>
                <td>7</td>
                <td>0.09</td>
                <td>20</td>
                <td>0.043</td>
              </tr>
              <tr valign="top">
                <td>F105</td>
                <td>2</td>
                <td>0.295</td>
                <td>11</td>
                <td>0.25</td>
                <td>6</td>
                <td>0.213</td>
                <td>7</td>
                <td>0.067</td>
                <td>14</td>
                <td>0.042</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>Scraped forum file.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>b</sup>Topic identifier: Machine Learning Language Toolkit-generated topics.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>c</sup>Weight of topic in the file.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Statistical Analyses</title>
        <p>The output from MALLET assigned weight scores (ie, topic-strength) to each topic-ID within each file. Statistical analysis was carried out (1) to understand the similarity across the feature sets and files and (2) to identify the topics that are most relevant to patients with breast cancer. Euclidean Distance Similarity Measures (EDSM) were computed to evaluate the similarity across the files based on their weight scores for each topic. Equation (1) is an example of how each file and its feature vectors were assigned, using file F100 as an example.</p>
        <disp-quote>
          <p>F100 = [Topic6(.33), Topic1(.28), Topic9(.20), Topic2(.08), Topic0(.05), Topic3(.04), Topic12(.01), Topic4(.01), Topic n(weight m)…]</p>
          <attrib>1</attrib>
        </disp-quote>
        <p>The EDSM between all potential file pairs were computed on the basis of Equation (2):</p>
        <graphic xlink:href="medinform_v6i4e45_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>Where <italic>i</italic> and <italic>j</italic> are identifiers for each file pair, <italic>k</italic> is the total number of topics in the dataset (ie, 15, 20, or 30 topics), and <italic>x</italic> is the topic weight score in each file.</p>
        <p>To identify the most relevant topics in the dataset, an MLR analysis was performed on all files. The MLR analysis was computed using the R Statistical Package [<xref ref-type="bibr" rid="ref15">15</xref>]. The MLR analysis identified the topics that seem to be most relevant within and across the forums in the study. The equation model (Equation 3) for the MLR used is:</p>
        <graphic xlink:href="medinform_v6i4e45_fig11.PNG" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>Where <italic>topic<sub>1</sub></italic>, <italic>topic<sub>2</sub></italic> are the weight scores of the topics in the files; <italic>ε</italic><sub>i</sub> is the error in the model; β<sub>0</sub> is the intercept; <italic>β</italic><sub>1</sub>, <italic>β</italic><sub>2</sub> are the coefficients for the <italic>topic<sub>1</sub></italic>, <italic>topic<sub>2</sub></italic>, respectively, computed by the model; and y<sub>i</sub> is the outcome (dependent variable) for each file <italic>i</italic>.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Manual Categorization of Posts</title>
        <p>Among the 5 sites studied, Breastcancer.org Community presented the majority of the total post volume. From all websites analyzed for post counts, 73.6% (3.61/4.90 million) of posts were from Breastcancer.org Community; hence, we selected this site for further exploration.</p>
      </sec>
      <sec>
        <title>Manual Categorization</title>
        <p>Initially, the research team performed, via QCA, a manual categorization of topics discussed in the 5 selected public websites. The popularity distribution of the manually generated categories, as discussed in the Methods section, was assessed by the number of posts made in the forums. For example, the qualitatively generated category <italic>Diagnosed—Psychosocial—Similar Patients</italic> has an overall popularity of 41,972 posts. The forum categorization was either general or granular depending on the forum structure. The distribution of qualitative categories across the threads on the Breastcancer.org Community website is visualized in the figure below by frequencies of category popularity (<xref ref-type="fig" rid="figure2">Figure 2</xref>); 20 of our QCA-generated categories mapped to forums on Breastcancer.org. The <italic>x</italic>-axis shows the QCA-assigned category names.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Distribution of qualitative content analysis-generated categories according to the number of forum threads that each manual category possesses. DX: Diagnosed; TLD: Top-Level Domain; NDC: Not Diagnosed but Concerned.</p>
          </caption>
          <graphic xlink:href="medinform_v6i4e45_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Data Extraction, Natural Language Processing, and Statistical Modeling</title>
        <sec>
          <title>Data Extraction</title>
          <p>The data for all 80 forums on the Breastcancer.org Community website were successfully extracted into 80 files, each containing all communications posted over its respective forum.</p>
        </sec>
        <sec>
          <title>Natural Language Processing: Topic Modeling</title>
          <p>As mentioned earlier, exhaustion was reached at baskets of 30 cooccurring words. The remainder of the analyses will be only for these topics. All machine-generated topics were assigned topic labels based on the semantics of the word cluster and validated by a domain expert (clinical); the topic ID was equated with the term key. MALLET assigned an LDA strength to each topic indicating its overall dominance across all forum files that were analyzed. Two example topics, IDs #8 and #29, are listed in <xref ref-type="table" rid="table4">Table 4</xref> (below), along with the authors’ labels for these topics. <xref ref-type="app" rid="app2">Multimedia Appendix 2</xref> provides a full list of generated topics from this model as well as the authors’ semantic interpretations. Each file represents the text of one forum, and topic-strength pairs for the strongest five topics per MALLET LDA analysis of that file are found to the right of the file’s ID. For any file, the strength across all 30 topics will always add up to 1.00.</p>
          <p>MALLET also correlated the topic relationship strengths between all files based on their topics. Strengths assigned to document-topic pairs by MALLET ranged from almost zero (&#60;0.000001) up to 0.796. The maximum theoretical possible strength for a single file-topic pair would be 1.00. <xref ref-type="app" rid="app3">Multimedia Appendix 3</xref> provides a list of the top 5 correlated topics of each file.</p>
        </sec>
        <sec>
          <title>Statistical Analysis: Euclidean Distance Similarity Measures</title>
          <p>EDSM were calculated to find the similarity between files. <xref ref-type="fig" rid="figure3">Figure 3</xref> shows the file-file similarity matrix and a subset of the similarity matrix. The files are mirrored across both axes and ordered by their alphabetical file name (with F100 being first and F99 being last). Darker cells indicate that the files were more similar. File 109 has a similarity measure of 0.89 with file 104; similarly, file 108 has a similarity measure of 0.78 with file 104.</p>
          <p><xref ref-type="table" rid="table5">Table 5</xref> illustrates the similarity measures among the file pairs with a similarity score ≥0.8.</p>
          <table-wrap position="float" id="table4">
            <label>Table 4</label>
            <caption>
              <p>Topics #8 and #29 with latent Dirichlet allocation strengths and authors’ topic label interpretations.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="120"/>
              <col width="250"/>
              <col width="400"/>
              <col width="230"/>
              <thead>
                <tr valign="top">
                  <td>Topic identifier</td>
                  <td>Latent Dirichlet allocation strength</td>
                  <td>Topic words</td>
                  <td>Authors’ topic label</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>8</td>
                  <td>1.38724</td>
                  <td>cancer chemo years feel life family mom time support things breast people treatment don’t husband care friends diagnosed talk mother</td>
                  <td>Hope, love, family, and friends</td>
                </tr>
                <tr valign="top">
                  <td>29</td>
                  <td>0.19954</td>
                  <td>hair book pink survivor happy deb health country president shirley obama congratulations cats article eye mammo fumi beth beautiful vote</td>
                  <td>Daily living and breast cancer</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>File-file similarity matrix.</p>
            </caption>
            <graphic xlink:href="medinform_v6i4e45_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <table-wrap position="float" id="table5">
            <label>Table 5</label>
            <caption>
              <p>Top scored file-file similarity measures.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="400"/>
              <col width="600"/>
              <thead>
                <tr valign="top">
                  <td>File identifier</td>
                  <td>Associated files (similarity score)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>F102</td>
                  <td>F133 (0.85), F144 (0.91), F152 (0.97), F116 (0.95)</td>
                </tr>
                <tr valign="top">
                  <td>F104</td>
                  <td>F109 (0.89), F142 (0.81), F150 (0.82), F27 (0.89)</td>
                </tr>
                <tr valign="top">
                  <td>F108</td>
                  <td>F132 (0.94), F137 (0.86), F145 (0.97), F5 (0.90), F71 (0.93), F88 (0.86), F96 (0.97)</td>
                </tr>
                <tr valign="top">
                  <td>F109</td>
                  <td>F104 (0.89), F142 (0.89), F127 (0.85)</td>
                </tr>
                <tr valign="top">
                  <td>F110</td>
                  <td>F26 (0.80)</td>
                </tr>
                <tr valign="top">
                  <td>F111</td>
                  <td>F132 (0.80), F68 (0.87)</td>
                </tr>
                <tr valign="top">
                  <td>F112</td>
                  <td>F47 (0.92), F93 (0.94)</td>
                </tr>
                <tr valign="top">
                  <td>F113</td>
                  <td>F139 (0.89), F55 (0.87)</td>
                </tr>
                <tr valign="top">
                  <td>F133</td>
                  <td>F102 (0.85), F135 (0.86), F145 (0.90), F5 (0.87), F71 (0.94), F96 (0.94), F88 (0.87)</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <p>A cursory review of the files with high similarity also revealed clinical relevance and connection. For example, files F108, F132, and F145, while in different discussion categories on the website, all discuss the diagnosis, treatment, and potential side effects from the treatments and also discuss living with different stages and types of breast cancer. In addition, F96 has a high similarity (97%) with F108, which is devoted to the breast cancer type known as invasive ductal carcinoma. F112 discusses more specific genetic risks of breast cancer (BRCA1 or BRCA2 positive), while F47 (similarity 92%) chats about more general risks. F93 at first glance seems not related (Comments, Suggestions, Feature Requests), but reading the postings revealed the need for more information and social support for users who find out that they are <italic>at risk</italic> for breast cancer.</p>
          <fig id="figure4" position="float">
            <label>Figure 4</label>
            <caption>
              <p>Topic-topic similarity matrix.</p>
            </caption>
            <graphic xlink:href="medinform_v6i4e45_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <p><xref ref-type="fig" rid="figure4">Figure 4</xref> shows 4 clusters of highly correlated computationally modeled topics. Each cluster is labeled with a letter, and topics are labeled as <italic>keys</italic>. The topics with their consensus labels based on the semantic meaning of the word baskets are as follows:</p>
          <p>Cluster A: Symptoms &#38; Diagnosis</p>
          <list list-type="bullet">
            <list-item>
              <p><italic>Topic 1</italic>: Diagnostic testing and waiting for result</p>
            </list-item>
            <list-item>
              <p><italic>Topic 7</italic>: Genetic risk and testing</p>
            </list-item>
            <list-item>
              <p><italic>Topic 30</italic>: Symptoms and diagnosis of recurrence</p>
            </list-item>
            <list-item>
              <p><italic>Topic 3</italic>: Positive results after recurrence</p>
            </list-item>
            <list-item>
              <p><italic>Topic 26</italic>: Positive results after treatment for recurrence</p>
            </list-item>
            <list-item>
              <p><italic>Topic 9</italic>: Diagnostic and treatment observation for recurrence</p>
            </list-item>
            <list-item>
              <p><italic>Topic 14</italic>: Medical drug treatment and long-term effects</p>
            </list-item>
          </list>
          <p>Cluster B: Treatment</p>
          <list list-type="bullet">
            <list-item>
              <p><italic>Topic 18</italic>: Chemotherapy side effects and change of treatment</p>
            </list-item>
            <list-item>
              <p><italic>Topic 22</italic>: General feeling over time</p>
            </list-item>
            <list-item>
              <p><italic>Topic 23</italic>: Medical or drug treatment and side effects</p>
            </list-item>
            <list-item>
              <p><italic>Topic 15</italic>: Physical activities during and after chemo</p>
            </list-item>
            <list-item>
              <p><italic>Topic 20</italic>: Side effects of breast cancer treatment</p>
            </list-item>
            <list-item>
              <p><italic>Topic 16</italic>: Surgical treatments while in remission</p>
            </list-item>
            <list-item>
              <p><italic>Topic 21</italic>: Lingering side effects while in remission</p>
            </list-item>
            <list-item>
              <p><italic>Topic 27</italic>: Surgical reconstruction during remission</p>
            </list-item>
            <list-item>
              <p><italic>Topic 5</italic>: Prognosis about relapse or recurrence</p>
            </list-item>
            <list-item>
              <p><italic>Topic 25</italic>: Support from caregivers and medical team for recovery long term</p>
            </list-item>
            <list-item>
              <p><italic>Topic 17</italic>: Nutrition and supplements</p>
            </list-item>
            <list-item>
              <p><italic>Topic 2</italic>: Side effect of inflammation and its treatment</p>
            </list-item>
            <list-item>
              <p><italic>Topic 10</italic>: Radiation side effects and duration of the effects</p>
            </list-item>
          </list>
          <p>Cluster C: Financial</p>
          <list list-type="bullet">
            <list-item>
              <p><italic>Topic 24</italic>: Financial issues over time</p>
            </list-item>
            <list-item>
              <p><italic>Topic 11</italic>: Forum-related discussion for support from people in similar circumstances</p>
            </list-item>
            <list-item>
              <p><italic>Topic 13</italic>: Looking for clinical research and trials</p>
            </list-item>
          </list>
          <p>Cluster D: Friends &#38; Family</p>
          <list list-type="bullet">
            <list-item>
              <p><italic>Topic 4</italic>: Friends and fun</p>
            </list-item>
            <list-item>
              <p><italic>Topic 29</italic>: Everyday life and breast cancer</p>
            </list-item>
            <list-item>
              <p><italic>Topic 28</italic>: Spirituality and religion</p>
            </list-item>
            <list-item>
              <p><italic>Topic 19</italic>: Feeling back to normal</p>
            </list-item>
            <list-item>
              <p><italic>Topic 8</italic>: Hope, love, family, and friends</p>
            </list-item>
            <list-item>
              <p><italic>Topic 12</italic>: Feeling positive and support</p>
            </list-item>
          </list>
          <table-wrap position="float" id="table6">
            <label>Table 6</label>
            <caption>
              <p>Most significant topics identified via multiple linear regression analysis.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="120"/>
              <col width="630"/>
              <col width="250"/>
              <thead>
                <tr valign="top">
                  <td>Topic identifier</td>
                  <td>Topic label</td>
                  <td>Akaike information criterion values</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>21</td>
                  <td>Lingering side effects while in remission</td>
                  <td>−642.75</td>
                </tr>
                <tr valign="top">
                  <td>18</td>
                  <td>Chemotherapy side effects and change of treatment</td>
                  <td>−641.98</td>
                </tr>
                <tr valign="top">
                  <td>10</td>
                  <td>Radiation and side effects</td>
                  <td>−633.17</td>
                </tr>
                <tr valign="top">
                  <td>7</td>
                  <td>Genetic risk and testing</td>
                  <td>−620.41</td>
                </tr>
                <tr valign="top">
                  <td>25</td>
                  <td>Support from caregiver and medical team for recovery long term</td>
                  <td>−571.78</td>
                </tr>
                <tr valign="top">
                  <td>11</td>
                  <td>Looking for support from people in similar circumstances</td>
                  <td>−412.32</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <p>As can be deduced from the semantic labeling, each cluster describes a theme: cluster A is related to risk factors, diagnosis, and potential risk of recurrence, whereas cluster B describes different treatments and their side effects in the short and long term. Clusters C and D are less clinically oriented and more oriented to patient contextual factors (ie, those that are typically ascertainable only outside of the clinic encounters).</p>
        </sec>
        <sec>
          <title>Statistical Analyses: Multiple Linear Regression Analysis</title>
          <p>Finally, MLR analysis was performed to identify the most significant topics (keys) across the 4 abovementioned clusters. The topics were ranked by their Akaike information criterion value, the most appropriate measure for the methodology, from lowest (most significant) to highest. The most significant topics identified by the model were: Topic 21 &gt; Topic 18 &gt; Topic 10 &gt; Topic 7 &gt; Topic 25 &gt; Topic 11. <xref ref-type="table" rid="table6">Table 6</xref> reports the most significant topics discussed among the forum participants, along with the respective topic labels assigned by the authors.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>It is well known that many users share information online daily. Forum posts, blogs, or other social media activity reveal a rich diary of everyday life. Health information is revealed explicitly when an individual communicates about their well-being or when they ask for guidance, information on a very specific health issue, treatment, and other related topics. Our goal was to explore a method to enhance our ability to collect and interpret information from those social media sources. Our methodology allowed us to organize more than 4 million postings into 30 topics, which were subsequently clustered into 4 groups.</p>
        <p>The popularity of QCA-generated categories (as measured by the number of posts in their associated forums) showed a logarithmic-linear (<italic>log-lin</italic>) distribution, strongly suggesting that users, through self-selection, gravitate disproportionately toward a few QCA-generated categories, while most topics receive comparatively little attention. Moreover, it is of great interest that topic modeling analysis via MALLET showed that the overall LDA strengths of each topic among the forum documents (as seen in <xref ref-type="fig" rid="figure3">Figure 3</xref>) also followed a <italic>log-lin</italic> type of distribution, allowing for the same type of conclusion in objectively quantifying content with regard to the MALLET-generated topics.</p>
        <p>In addition, a modest level of correlation was observed between the strongest (via MLR analysis) MALLET topics and the strength (by user posting) of manual QCA-generated categories. Topic 11, <italic>looking for support from people in similar circumstances</italic>, is almost semantically identical to the QCA-generated category of <italic>Diagnosed—Psychosocial—Similar Patients</italic>. The latter category encompassed 34.4% (41,972/121,688) of all threads on the Breastcancer.org Community website. Topics 10 and 18 (<italic>Radiation and side effects</italic> and <italic>Chemotherapy side effects and change of treatment</italic>, respectively), meanwhile, correlate in general to the manually generated category of <italic>Diagnosed—Treatment</italic> (top-level domain), which covered 38,698 threads on the site. Topic 7 (<italic>genetic risk and testing</italic>) is semantically similar to the QCA-generated category of <italic>Not Diagnosed But Concerned—Testing</italic>, noted in 4519 site user threads.</p>
        <p>The computationally assigned importance of Topic 11 when combined with its equally significant manually generated category correlate demonstrates the need for health personnel to take into account the contextual (nonclinical) factors unlikely to be captured in conventional medical documentation and not supported by conventional clinic-based information technology resources. In particular, a greater emphasis on information-mediated psychosocial interventions is supported by the results of this research.</p>
        <p>The computational topic modeling analysis via MALLET also demonstrated topics that did not arise via manual category generation. These topics, in particular Topic 21 (<italic>lingering side effects while in remission</italic>) and Topic 25 (<italic>support from caregiver and medical team for recovery long term</italic>), mirror breast cancer survivorship instead of the disorder itself. Therefore, it is suggested that computational topic modeling software such as MALLET is useful in future research on large bodies of patient-generated text and can generate topics similar in quality to those generated by expert QCA; furthermore, this type of software can detect significant but <italic>hidden</italic> topics (such as social and daily living issues dealt with during survivorship) that are not otherwise detectable when only the forum labels given by a site are analyzed qualitatively.</p>
        <p>Visual analysis of the file-file (ie, forum-forum) similarity matrix (see <xref ref-type="fig" rid="figure3">Figure 3</xref>) shows a particular concentration of similar files along the diagonal axis, indicating that files with close numbers tended to be more similar in content. This observation strengthens the case for using computational topic modeling software such as MALLET because closely numbered files in the study at hand tended to originate from forums that resided in identical categories on the Breastcancer.org Community website. Similarly, topics that closely correlated with each other were noted to have clinically significant correlates. It is important to note that many of these correlates may not have been intuitive at face value but were more explainable with clinical expertise.</p>
        <p>Overall, the research team was able to gain significant insight into the daily lives, clinical and otherwise, of patients affected by breast cancer; the onus to support survivors of breast cancer was also revealed. Furthermore, the research performed generated significant support for the use of computational topic modeling software such as MALLET to analyze patient-generated information for nonclinical issues revealed by patients with breast cancer over relevant disease-specific online forums.</p>
      </sec>
      <sec>
        <title>Limitations and Future Considerations</title>
        <p>The data could be better annotated in a metadata-facilitated context as opposed to being in a purely free-text format; the granularity of the ontology in which the data are stored can be improved in future research. In particular, having the posts traceable to unique anonymized users would be of assistance. To achieve this granularity, forums in the future can be scraped in a manner that preserves the HTML source code of their content; structured information could then be extracted from the HTML. The granularity of time, if extractable from the HTML, could potentially facilitate the generation of individual patient records and potentially even allow patient narratives to be analyzed in a longitudinal (time-wise) fashion.</p>
        <p>Detailed analysis of similarity measures between files and clustering methods is an important part of potential future research and will require thorough analysis by clinical and patient health experts. This process is very time consuming and is out of the scope of this study, as our goal in this work is to present a method for modeling data in a meaningful format. Future work will focus more on further analysis of data to identify hidden relationships between files and topics that might reveal hidden aspects of breast cancer patients’ challenges in their real lives.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>The importance of patient-generated data (including patient-generated information via online communications) is growing among scholars because of their value in identifying hidden aspects of patients’ challenges and concerns. This study provides a reasonable amount of insight into the areas of interest or concern that patients with breast cancer discuss in social media and may need to be addressed to optimize patient disease and health management.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <app id="app1">
        <title>Multimedia Appendix 1</title>
        <p>The 80 forums and their groupings on the Breastcancer.org Community website.</p>
        <media xlink:href="medinform_v6i4e45_app1.pdf" xlink:title="PDF File (Adobe PDF File), 112KB"/>
      </app>
      <app id="app2">
        <title>Multimedia Appendix 2</title>
        <p>File-feature set.</p>
        <media xlink:href="medinform_v6i4e45_app2.pdf" xlink:title="PDF File (Adobe PDF File), 107KB"/>
      </app>
      <app id="app3">
        <title>Multimedia Appendix 3</title>
        <p>List of topics generated by MALLET in the 30 topic model.</p>
        <media xlink:href="medinform_v6i4e45_app3.pdf" xlink:title="PDF File (Adobe PDF File), 117KB"/>
      </app>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">EDSM</term>
          <def>
            <p>Euclidean Distance Similarity Measures</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">LDA</term>
          <def>
            <p>Latent Dirichlet allocation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">MALLET</term>
          <def>
            <p>Machine Learning Language Toolkit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">MLR</term>
          <def>
            <p>multiple linear regression</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">QCA</term>
          <def>
            <p>qualitative content analysis</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors would like to acknowledge Breastcancer.org Community (website) as well as the graduate students from the BioHealth Informatics Department who provided support and input in different aspects of the work. This research received no specific grant from any funding agency in public, commercial, or not-for-profit sectors.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Esquivel</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Meric-Bernstam</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Bernstam</surname>
            <given-names>EV</given-names>
          </name>
        </person-group>
        <article-title>Accuracy and self correction of information received from an internet breast cancer list: content analysis</article-title>
        <source>BMJ</source>  
        <year>2006</year>  
        <month>04</month>  
        <day>22</day>  
        <volume>332</volume>  
        <issue>7547</issue>  
        <fpage>939</fpage>  
        <lpage>42</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/16513686"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1136/bmj.38753.524201.7C</pub-id>
        <pub-id pub-id-type="medline">16513686</pub-id>
        <pub-id pub-id-type="pii">bmj.38753.524201.7C</pub-id>
        <pub-id pub-id-type="pmcid">PMC1444809</pub-id></nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Lin</surname>
            <given-names>Q</given-names>
          </name>
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>W</given-names>
          </name>
          <name name-style="western">
            <surname>Yu</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Elhadad</surname>
            <given-names>N</given-names>
          </name>
        </person-group>
        <article-title>We Make Choices We Think Are Going to Save Us: Debate and Stance Identification for Online Breast Cancer CAM Discussions</article-title>
        <source>Proceedings of 26th International Conference on World Wide Web Companion</source>  
        <year>2017</year>  
        <conf-name>26th International Conference on World Wide Web</conf-name>
        <conf-date>2017 April 3</conf-date>
        <conf-loc>Perth, Australia</conf-loc>
        <publisher-loc>Geneva, Switzerland</publisher-loc>
        <publisher-name>International World Wide Web Conferences Steering Committee</publisher-name>
        <fpage>1073</fpage>  
        <lpage>1081</lpage> </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <collab>Office of the National Coordinator (ONC) for Health Information Technology</collab>
        </person-group>
        <source>HealthIT.gov</source>  
        <year>2018</year>  
        <month>03</month>  
        <day>21</day>  
        <access-date>2018-03-02</access-date>
        <publisher-loc>Washington, DC, USA</publisher-loc>
        <publisher-name>United States Department of Health and Human Services</publisher-name>
        <comment>Patient-Generated Health Data 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.healthit.gov/policy-researchers-implementers/patient-generated-health-data">https://www.healthit.gov/policy-researchers-implementers/patient-generated-health-data</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6xcjfJpno"/></comment> </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Kostick</surname>
            <given-names>Kristin M</given-names>
          </name>
          <name name-style="western">
            <surname>Blumenthal-Barby</surname>
            <given-names>Jennifer S</given-names>
          </name>
          <name name-style="western">
            <surname>Wilhelms</surname>
            <given-names>Lidija A</given-names>
          </name>
          <name name-style="western">
            <surname>Delgado</surname>
            <given-names>Estevan D</given-names>
          </name>
          <name name-style="western">
            <surname>Bruce</surname>
            <given-names>Courtenay R</given-names>
          </name>
        </person-group>
        <article-title>Content Analysis of Social Media Related to Left Ventricular Assist Devices</article-title>
        <source>Circ Cardiovasc Qual Outcomes</source>  
        <year>2015</year>  
        <month>09</month>  
        <volume>8</volume>  
        <issue>5</issue>  
        <fpage>517</fpage>  
        <lpage>23</lpage>  
        <pub-id pub-id-type="doi">10.1161/CIRCOUTCOMES.115.002032</pub-id>
        <pub-id pub-id-type="medline">26219889</pub-id>
        <pub-id pub-id-type="pii">CIRCOUTCOMES.115.002032</pub-id></nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Asch</surname>
            <given-names>David A</given-names>
          </name>
          <name name-style="western">
            <surname>Rader</surname>
            <given-names>Daniel J</given-names>
          </name>
          <name name-style="western">
            <surname>Merchant</surname>
            <given-names>Raina M</given-names>
          </name>
        </person-group>
        <article-title>Mining the social mediome</article-title>
        <source>Trends Mol Med</source>  
        <year>2015</year>  
        <month>09</month>  
        <volume>21</volume>  
        <issue>9</issue>  
        <fpage>528</fpage>  
        <lpage>9</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26341614"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1016/j.molmed.2015.06.004</pub-id>
        <pub-id pub-id-type="medline">26341614</pub-id>
        <pub-id pub-id-type="pii">S1471-4914(15)00121-5</pub-id>
        <pub-id pub-id-type="pmcid">PMC4662876</pub-id></nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Attai</surname>
            <given-names>DJ</given-names>
          </name>
          <name name-style="western">
            <surname>Cowher</surname>
            <given-names>MS</given-names>
          </name>
          <name name-style="western">
            <surname>Al-Hamadani</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Schoger</surname>
            <given-names>JM</given-names>
          </name>
          <name name-style="western">
            <surname>Staley</surname>
            <given-names>AC</given-names>
          </name>
          <name name-style="western">
            <surname>Landercasper</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Twitter social media is an effective tool for breast cancer patient education and support: patient-reported outcomes by survey</article-title>
        <source>Journal of Medical Internet Research</source>  
        <year>2015</year>  
        <volume>17</volume>  
        <issue>7</issue>  
        <fpage>e188</fpage>  
        <pub-id pub-id-type="pmcid">PMC4705354</pub-id></nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="web">
        <source>Breastcancer.org</source>  
        <access-date>2018-04-19</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://community.breastcancer.org/">https://community.breastcancer.org/</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6yna7qQb6"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <collab>Breast Cancer Care</collab>
        </person-group>
        <source>Breast Cancer Care</source>  
        <year>2018</year>  
        <access-date>2018-04-19</access-date>
        <comment>Breast Cancer Care Forums 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://forums.breastcancercare.co.uk/">http://forums.breastcancercare.co.uk/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6ynb1syFD"/></comment> </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <collab>Susan G Komen Foundation</collab>
        </person-group>
        <source>Susan G Komen Foundation (Website)</source>  
        <year>2018</year>  
        <access-date>2018-04-19</access-date>
        <comment>Message Boards 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://apps.komen.org/forums/">https://apps.komen.org/forums/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6ynephq1I"/></comment> </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <collab>Triple Negative Breast Cancer Foundation</collab>
        </person-group>
        <source>Triple Negative Breast Cancer Foundation (Website)</source>  
        <year>2018</year>  
        <access-date>2018-04-19</access-date>
        <comment>Triple Negative Forums 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://forum.tnbcfoundation.org/">http://forum.tnbcfoundation.org/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6ynbiJjXf"/></comment> </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <collab>No Surrender Breast Cancer Foundation</collab>
        </person-group>
        <source>No Surrender Breast Cancer Foundation (Website)</source>  
        <year>2018</year>  
        <access-date>2018-04-19</access-date>
        <comment>No Surrender Breast Cancer Support Forum 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://nosurrenderbreastcancersupportforum.com/">http://nosurrenderbreastcancersupportforum.com/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6yne8PLvp"/></comment> </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <collab>Institute of Medicine (National Academies of Science)</collab>
        </person-group>
        <source>Capturing Social and Behavioral Domains and Measures in Electronic Health Records (Phase 2)</source>  
        <year>2015</year>  
        <month>01</month>  
        <day>08</day>  
        <publisher-loc>Washington, DC, USA</publisher-loc>
        <publisher-name>National Academies Press (USA)</publisher-name></nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Blei</surname>
            <given-names>DM</given-names>
          </name>
          <name name-style="western">
            <surname>Ng</surname>
            <given-names>AY</given-names>
          </name>
          <name name-style="western">
            <surname>Jordan</surname>
            <given-names>MI</given-names>
          </name>
        </person-group>
        <article-title>Latent Dirichlet allocation</article-title>
        <source>Journal of Machine Learning Research</source>  
        <year>2003</year>  
        <volume>3</volume>  
        <issue>Jan</issue>  
        <fpage>993</fpage>  
        <lpage>1022</lpage> </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>McCallum</surname>
            <given-names>AK</given-names>
          </name>
        </person-group>
        <source>MALLET: A Machine Learning Language Toolkit (Software Tool)</source>  
        <year>2002</year>  
        <access-date>2017-04-17</access-date>
        <publisher-loc>Amherst, MA, USA</publisher-loc>
        <publisher-name>University of Massachusetts</publisher-name>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://mallet.cs.umass.edu/">http://mallet.cs.umass.edu/</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6xcjxfqfW"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <collab>R Development Core Team</collab>
        </person-group>
        <source>R: A language and environment for statistical computing [Software tool]</source>  
        <year>2008</year>  
        <access-date>2017-04-17</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.r-project.org/">http://www.r-project.org/</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6xck6uupv"/>
        </comment> </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
