<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
    <article-id pub-id-type="publisher-id">v5i3e23</article-id>
    <article-id pub-id-type="pmid">28760725</article-id>
    <article-id pub-id-type="doi">10.2196/medinform.7779</article-id>
    <article-categories>
      <subj-group subj-group-type="heading">
        <subject>Original Paper</subject>
      </subj-group>
      <subj-group subj-group-type="article-type">
        <subject>Original Paper</subject>
      </subj-group>
    </article-categories>
    <title-group>
      <article-title>What Patients Can Tell Us: Topic Analysis for Social Media on Breast Cancer</article-title>
    </title-group>
    <contrib-group>
      <contrib contrib-type="editor">
        <name>
          <surname>Eysenbach</surname>
          <given-names>Gunther</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Zhang</surname>
          <given-names>Kunpeng</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Linares</surname>
          <given-names>Deborah</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Tuo</surname>
          <given-names>Shanshan</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="author" id="contrib1" corresp="yes">
      <name name-style="western">
        <surname>Tapi Nzali</surname>
        <given-names>Mike Donald</given-names>
      </name>
      <degrees>PhD</degrees>
      <xref rid="aff1" ref-type="aff">1</xref>
      <address>
        <institution>Institut Montpelliérain Alexander Grothendieck (IMAG)</institution>
        <institution>Department of Mathematics</institution>
        <institution>Montpellier University</institution>
        <addr-line>Place Eugène Bataillon</addr-line>
        <addr-line>Case Courrier 051</addr-line>
        <addr-line>Montpellier, 34095</addr-line>
        <country>France</country>
        <phone>33 4 67 41 86 72</phone>
        <fax>33 4 67 41 85 00</fax>
        <email>tapinzali@lirmm.fr</email>
      </address>  
      <xref rid="aff2" ref-type="aff">2</xref>
      <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-6245-5516</ext-link></contrib>
      <contrib contrib-type="author" id="contrib2">
        <name name-style="western">
          <surname>Bringay</surname>
          <given-names>Sandra</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff2" ref-type="aff">2</xref>
        <xref rid="aff3" ref-type="aff">3</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-2830-3666</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib3">
        <name name-style="western">
          <surname>Lavergne</surname>
          <given-names>Christian</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff1" ref-type="aff">1</xref>
        <xref rid="aff3" ref-type="aff">3</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-5826-1911</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib4">
        <name name-style="western">
          <surname>Mollevi</surname>
          <given-names>Caroline</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff4" ref-type="aff">4</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-4827-3684</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib5">
        <name name-style="western">
          <surname>Opitz</surname>
          <given-names>Thomas</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff5" ref-type="aff">5</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-5863-5020</ext-link>
      </contrib>
    </contrib-group>
    <aff id="aff1">
    <sup>1</sup>
    <institution>Institut Montpelliérain Alexander Grothendieck (IMAG)</institution>
    <institution>Department of Mathematics</institution>  
    <institution>Montpellier University</institution>  
    <addr-line>Montpellier</addr-line>
    <country>France</country></aff>
    <aff id="aff2">
    <sup>2</sup>
    <institution>Laboratoire d'Informatique, de Robotique et de Microélectronique de Montpellier (LIRMM)</institution>
    <institution>Department of Computer Science</institution>  
    <institution>Montpellier University</institution>  
    <addr-line>Montpellier</addr-line>
    <country>France</country></aff>
    <aff id="aff3">
      <sup>3</sup>
      <institution>Paul Valery University</institution>
      <addr-line>Montpellier</addr-line>
      <country>France</country>
    </aff>
    <aff id="aff4">
    <sup>4</sup>
    <institution>Biometrics Unit</institution>
    <institution>Institut du Cancer Montpellier (ICM)</institution>  
    <addr-line>Montpellier</addr-line>
    <country>France</country></aff>
    <aff id="aff5">
    <sup>5</sup>
    <institution>BioSP Unit</institution>
    <institution>Institut National de la Recherche Agronomique (INRA)</institution>  
    <addr-line>Avignon</addr-line>
    <country>France</country></aff>
    <author-notes>
      <corresp>Corresponding Author: Mike Donald Tapi Nzali 
      <email>tapinzali@lirmm.fr</email></corresp>
    </author-notes>
    <pub-date pub-type="collection"><season>Jul-Sep</season><year>2017</year></pub-date>
    <pub-date pub-type="epub">
      <day>31</day>
      <month>07</month>
      <year>2017</year>
    </pub-date>
    <volume>5</volume>
    <issue>3</issue>
    <elocation-id>e23</elocation-id>
    <!--history from ojs - api-xml-->
    <history>
      <date date-type="received">
        <day>5</day>
        <month>4</month>
        <year>2017</year>
      </date>
      <date date-type="rev-request">
        <day>17</day>
        <month>5</month>
        <year>2017</year>
      </date>
      <date date-type="rev-recd">
        <day>16</day>
        <month>6</month>
        <year>2017</year>
      </date>
      <date date-type="accepted">
        <day>17</day>
        <month>6</month>
        <year>2017</year>
      </date>
    </history>
    <!--(c) the authors - correct author names and publication date here if necessary. Date in form ', dd.mm.yyyy' after jmir.org-->
    <copyright-statement>©Mike Donald Tapi Nzali, Sandra Bringay, Christian Lavergne, Caroline Mollevi, Thomas Opitz. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 31.07.2017.</copyright-statement>
    <copyright-year>2017</copyright-year>
    <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
      <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
    </license>  
    <self-uri xlink:href="http://medinform.jmir.org/2017/3/e23/" xlink:type="simple"/>
    <abstract>
      <sec sec-type="background">
        <title>Background</title>
        <p>Social media dedicated to health are increasingly used by patients and health professionals. They are rich textual resources with content generated through free exchange between patients. We are proposing a method to tackle the problem of retrieving clinically relevant information from such social media in order to analyze the quality of life of patients with breast cancer.</p>
      </sec>
      <sec sec-type="objective">
        <title>Objective</title>
        <p>Our aim was to detect the different topics discussed by patients on social media and to relate them to functional and symptomatic dimensions assessed in the internationally standardized self-administered questionnaires used in cancer clinical trials (European Organization for Research and Treatment of Cancer [EORTC] Quality of Life Questionnaire Core 30 [QLQ-C30] and breast cancer module [QLQ-BR23]).</p>
      </sec>
      <sec sec-type="methods">
        <title>Methods</title>
        <p>First, we applied a classic text mining technique, latent Dirichlet allocation (LDA), to detect the different topics discussed on social media dealing with breast cancer. We applied the LDA model to 2 datasets composed of messages extracted from public Facebook groups and from a public health forum (cancerdusein.org, a French breast cancer forum) with relevant preprocessing. Second, we applied a customized Jaccard coefficient to automatically compute similarity distance between the topics detected with LDA and the questions in the self-administered questionnaires used to study quality of life.</p>
      </sec>
      <sec sec-type="results">
        <title>Results</title>
        <p>Among the 23 topics present in the self-administered questionnaires, 22 matched with the topics discussed by patients on social media. Interestingly, these topics corresponded to 95% (22/23) of the forum and 86% (20/23) of the Facebook group topics. These figures underline that topics related to quality of life are an important concern for patients. However, 5 social media topics had no corresponding topic in the questionnaires, which do not cover all of the patients’ concerns. Of these 5 topics, 2 could potentially be used in the questionnaires, and these 2 topics corresponded to a total of 3.10% (523/16,868) of topics in the cancerdusein.org corpus and 4.30% (3014/70,092) of the Facebook corpus.</p>
      </sec>
      <sec sec-type="conclusions">
        <title>Conclusions</title>
        <p>We found a good correspondence between detected topics on social media and topics covered by the self-administered questionnaires, which substantiates the sound construction of such questionnaires. We detected new emerging topics from social media that can be used to complete current self-administered questionnaires. Moreover, we confirmed that social media mining is an important source of information for complementary analysis of quality of life.</p>
      </sec>
    </abstract>
    <kwd-group>
      <kwd>breast cancer</kwd>
      <kwd>text mining</kwd>
      <kwd>social media</kwd>
      <kwd>unsupervised learning</kwd>
    </kwd-group></article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Social media such as Facebook, Twitter, or Internet forums dedicated to health-related topics have evolved into easily accessible participatory tools for the exchange of knowledge, experience, and opinions through structured collections of text documents [<xref ref-type="bibr" rid="ref1">1</xref>]. Online health forums are used by patients to exchange information [<xref ref-type="bibr" rid="ref2">2</xref>]. Patients maintain their anonymity while discussing freely with other patients. Whereas communication with doctors and the medical staff in hospitals mainly revolves around technical issues of the disease and treatment, social media give patients access to more general exchanges of information, experiences, and mutual support among former and current patients [<xref ref-type="bibr" rid="ref3">3</xref>]. Such forums can therefore be considered as a valuable resource for the study of health-related quality of life (QoL). As shown by some studies (eg, [<xref ref-type="bibr" rid="ref4">4</xref>]), the anonymous environment of social media facilitates the unbiased expression of opinions and of feelings such as doubt or fear. Internet users have been shown to be primarily interested in specific information on health problems or diseases [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref7">7</xref>] and in adopting a healthier lifestyle and looking for alternative points of view [<xref ref-type="bibr" rid="ref5">5</xref>]. Here we propose an approach to structure and evaluate clinically relevant information in narratives extracted from online health social media, with a focus on the QoL of patients with breast cancer.</p>
      <p>While constant progress in medical science leads to new treatments and improved chances to prolong lives, such treatments can be difficult to undergo. QoL can be considered as an alternative clinical end point in this context, moving the focus away from quantity to quality [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. QoL falls within the scope of patient-reported outcomes; that is, measures of perceived health [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. These measures must therefore be reported by patients themselves. For instance, alternative treatments such as palliative treatment of terminal cancer may be less efficient from a traditional clinical stance but may still be preferable with respect to the patients’ QoL [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. Moreover, health economists must take into account the expense of treatments with respect to their effective benefits, for instance measured by the improvement in QoL (see Hirth et al [<xref ref-type="bibr" rid="ref16">16</xref>] and Cutler and McClellan [<xref ref-type="bibr" rid="ref17">17</xref>] for a general discussion, and Hillner and Smith [<xref ref-type="bibr" rid="ref18">18</xref>] for a cost-effectiveness study of chemotherapy in certain cases of breast cancer).</p>
      <p>Since QoL is a multidimensional, subjective, and culture-dependent concept, its quantification is not straightforward, as shown in the literature review of Garratt et al [<xref ref-type="bibr" rid="ref19">19</xref>]. This concept includes at least physical, psychological, and social well-being, as well as symptoms related to illness and treatment. Today, QoL is assessed in cancer clinical trials by self-administered questionnaires developed by the European Organization for Research and Treatment of Cancer (EORTC). The EORTC Quality of Life Questionnaire Core 30 (QLQ-C30) [<xref ref-type="bibr" rid="ref20">20</xref>] is a generic self-administered questionnaire often associated with disease-specific modules, such as the EORTC breast cancer module (QLQ-BR23). The EORTC QLQ-C30 contains 30 items and evaluates 15 dimensions of QoL: 5 functional scales, 1 QoL and global health status scale, and 8 symptomatic scales, as well as 1 scale measuring the financial difficulties associated with the disease. The EORTC QLQ-BR23 contains 23 questions. It is usually administered with the EORTC QLQ-C30 and is designed to measure QoL for breast cancer patients at various stages and with different treatment modalities. The evaluation consists of 4 functional scales and 4 symptomatic scales. Usually, self-administered questionnaires evaluate functional and symptomatic dimensions and are filled in at a predefined time of the study protocol, such as at baseline, during treatment, and at follow-up. In this context, an advantage of social media is that they allow patients to leave a written trace of their sentiment at any time, therefore avoiding potential self-reporting bias owing to a change of perception due to time lag.</p>
      <p>Opitz et al [<xref ref-type="bibr" rid="ref21">21</xref>] developed an automated approach for the supervised detection of topics defined in QLQ-BR23 questionnaire items for cancerdusein.org, a French forum specialized in breast cancer. In this new work, we used an unsupervised method to discover topics covered by health social media. Unsupervised methods have been successfully applied to biomedical data. For example, Arnold and Speier [<xref ref-type="bibr" rid="ref22">22</xref>] presented a topic model tailored to the clinical reporting environment that allows for individual patient timelines. Lu et al [<xref ref-type="bibr" rid="ref23">23</xref>] used text clustering algorithms on social media data to discover health-related topics. Zhang et al [<xref ref-type="bibr" rid="ref24">24</xref>] applied a convolutional neural network classifier to an online breast cancer community and carried out a longitudinal analysis to show topic distributions and topic changes throughout the members’ participation. In our study, the main medical application was to help improve questionnaires by including new topics of interest for patients (topics frequently discussed by patients and the impact on QoL) as new items in the questionnaires.</p>
      <p>Researchers have developed several topic models, including latent semantic analysis [<xref ref-type="bibr" rid="ref25">25</xref>], probabilistic latent semantic analysis [<xref ref-type="bibr" rid="ref26">26</xref>], latent Dirichlet allocation (LDA) [<xref ref-type="bibr" rid="ref27">27</xref>], and latent semantic indexing [<xref ref-type="bibr" rid="ref28">28</xref>]. In this study, we defined a general process based on LDA [<xref ref-type="bibr" rid="ref27">27</xref>] and applied this model to social media. LDA, an unsupervised generative probabilistic method for modeling a corpus, is the most commonly used topic modeling method. The main disadvantage of LDA is that there are no objective metrics that justify the choice of the hyperparameters. However, the main advantage of LDA is that it is a probabilistic model with interpretable topics. Nowadays, a growing number of probabilistic models are based on LDA and dedicated to particular tasks. For example, Zhan et al [<xref ref-type="bibr" rid="ref29">29</xref>] used LDA to identify topics among posts generated by e-cigarette users in social media. Wang et al [<xref ref-type="bibr" rid="ref30">30</xref>] and Paul and Dredze [<xref ref-type="bibr" rid="ref31">31</xref>] constructed a specialized and advanced LDA model using biomedical terms to provide a more effective way of exploring the biomedical literature. LDA has also been successfully used for patient-generated data [<xref ref-type="bibr" rid="ref32">32</xref>-<xref ref-type="bibr" rid="ref36">36</xref>] and in particular for online breast cancer discussions [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. Hao and Zhang [<xref ref-type="bibr" rid="ref37">37</xref>] used LDA to examine what Chinese patients said about their physicians in 4 major specialty areas. 
Hao et al [<xref ref-type="bibr" rid="ref38">38</xref>] used LDA to identify topics in positive and negative textual reviews of obstetricians and gynecologists from the 2 most popular online doctor rating websites in the United States and China. Yesha and Gangopadhyay [<xref ref-type="bibr" rid="ref39">39</xref>] described methods to identify topics and patterns within patient-generated data related to suicide and depression. LDA has also been used as a feature to build machine learning models to automatically identify the extent to which messages contain emotional and informational support on online health forums dealing with breast cancer [<xref ref-type="bibr" rid="ref40">40</xref>] or on Chinese social media [<xref ref-type="bibr" rid="ref41">41</xref>].</p>
      <p>Conducting automated research as we have done here is of considerable interest for processing a large amount of text obtained from social media. The LDA approach for extracting topics allows for better targeting for information exploration, reducing search time, and treating topics as a flat set of probability distributions; it can also be used to recover a set of topics from a corpus. In this work, we only used the LDA model and tuned parameters to align the topics found with QoL questionnaires. The originality of our approach is to automatically relate the topics obtained with the LDA method to the questionnaire items with an adaptation of the Jaccard coefficient.</p>
      <p>In this study, the purpose of our approach was diverse: (1) to provide a nonconventional analysis of QoL from social media and put the topics identified with this nonconventional analysis into perspective with those of classical QoL questionnaires collected in clinical trials (in particular in breast cancer: EORTC QLQ-C30 and QLQ-BR23); (2) to apply the LDA model to patient data with relevant pretreatments; (3) to index the narratives with respect to topics extracted through an unsupervised statistical analysis of forum content and to predefined topics from questionnaires used in cancer clinical trials; and (4) to discover new topics directly from patients’ concerns that are not included in the current questionnaires used to evaluate QoL, with the possibility that these topics could be included in these questionnaires if sufficiently relevant.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data</title>
        <sec>
          <title>Data Description</title>
          <p>In this work, we used datasets from 2 different social media sources: cancerdusein.org and Facebook groups. <xref ref-type="table" rid="table1">Table 1</xref> summarizes statistics from these 2 datasets.</p>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Number of users, threads, and posts on a social network and a health forum analyzed in this study.</p>
            </caption>
            <table width="578" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="163"/>
              <col width="186"/>
              <col width="186"/>
              <thead>
                <tr valign="top">
                  <td>Characteristics</td>
                  <td>Health forum (cancerdusein.org)</td>
                  <td>Social network (Facebook groups)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Date</td>
                  <td>October 2010-October 2014</td>
                  <td>October 2010-October 2014</td>
                </tr>
                <tr valign="top">
                  <td>No. of users</td>
                  <td>675</td>
                  <td>1394</td>
                </tr>
                <tr valign="top">
                  <td>No. of discussion threads</td>
                  <td>1050</td>
                  <td>11,013</td>
                </tr>
                <tr valign="top">
                  <td>No. of messages</td>
                  <td>16,868</td>
                  <td>70,092</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <p>The first dataset contained the forum posts from cancerdusein.org, a French health forum with more than 16,000 posts. These posts cover a large number of topics related to health issues. This forum is recommended to patients in a brochure of the <italic>Institut National du Cancer</italic> (INCA), which is the French reference organization in oncology. The forum is recommended for patients to exchange information and find comfort and potential solutions to their problems. It serves as an online cancer support community, where cancer patients, cancer survivors, and their families share information about cancer and their conditions. The second dataset contains posts from groups on Facebook, one of the most well-known social networks. We extracted 70,092 posts from 4 different public groups or communities on Facebook: <italic>Cancer du sein</italic>, <italic>Octobre rose 2014</italic>, <italic>Cancer du sein - breast cancer</italic>, and <italic>brustkrebs</italic>. We collected data from groups focusing on the adult population (the targeted users) and in which users were very active.</p>
          <p>On both social media platforms, patients freely exchange information without the need for moderators to supervise discussions. New messages can either be added to an existing thread or be posted to open a new thread. In cancerdusein.org, a thread appears in exactly 1 of the 13 predefined subforums, for example, <italic>Discussion générale</italic> [general discussion], <italic>Vivre mon cancer au quotidien</italic> [daily life with my cancer], <italic>Les bonnes nouvelles</italic> [good news], or <italic>Récidives et combats au long cours</italic> [relapses and long-term battles]. In Facebook groups, there are no predefined topics to index the threads. Structuring topics according to the subforum structure is possible in cancerdusein.org, but this structure underlines the relatively uninformative and widely spread topics, covering a strongly unbalanced number of messages. Such indexing is not possible in Facebook groups. Interestingly, we propose to accomplish a finer analysis of topics in the next section, which further enables the presence of several topics within 1 message.</p>
        </sec>
        <sec>
          <title>Data Preprocessing</title>
          <p>Texts on social media are often strongly heterogeneous and noisy, with many deviations from standards of spelling, syntax, and abbreviations, which impede efficient natural language processing. The French language has a rich spelling and grammar, characterized by special characters such as <italic>ç</italic>, various kinds of accented vowels (eg, <italic>é</italic>, <italic>è</italic>, <italic>ê</italic>, <italic>ë</italic>, <italic>â</italic>, and <italic>à</italic>), and many flexional variants. Additional rules exist for linking subsequent terms in certain situations (eg, the contraction <italic>du</italic> formed from <italic>de+le</italic> and the contraction <italic>des</italic> formed from <italic>de+les</italic>). As a consequence, automatic correction of text not obeying those rules is relatively difficult in practice. Furthermore, semantic analysis of texts is complicated by a large number of homonymy relationships: for example, <italic>pas</italic> can either mean <italic>step</italic> (noun) or can be the negation adverb <italic>not</italic>. As Balahur [<xref ref-type="bibr" rid="ref42">42</xref>] and Farzindar and Inkpen [<xref ref-type="bibr" rid="ref43">43</xref>] have pointed out, these linguistic peculiarities may affect classification performance. For this reason, we developed the following preprocessing steps.</p>
          <list list-type="bullet">
            <list-item>
              <p><italic>Removal of user tags.</italic> All user tags that have been identified in our corpus are removed, for example, @name, @surname.</p>
            </list-item>
            <list-item>
              <p><italic>Replacement of hyperlinks and email addresses.</italic> All the hypertext links are replaced by the term “link” and all the email addresses are replaced by the term “mail.” Hyperlinks (Internet, email, etc) are deleted. Emoticons are coded as :smile:, :sad:, etc.</p>
            </list-item>
            <list-item>
              <p><italic>Replacement of slang.</italic> Some expressions frequently used on social media, such as lol, mdr [lol], and xD, are removed.</p>
            </list-item>
            <list-item>
              <p><italic>Lemmatization.</italic> All words are lemmatized (using TreeTagger [<xref ref-type="bibr" rid="ref44">44</xref>]).</p>
            </list-item>
            <list-item>
              <p><italic>Lowercasing.</italic> Capital letters are lowercased.</p>
            </list-item>
            <list-item>
              <p><italic>Removal of stopwords.</italic></p>
            </list-item>
            <list-item>
              <p><italic>Replacement of specific patient terms.</italic> The texts for the 2 corpora are usually highly focused on a specific domain (breast cancer, in our case). Most often, as patients are laypersons in the medical field, they use slang, abbreviations, and their own vocabulary during their exchanges. To automatically analyze text from social networks, we need a specific vocabulary. In this work, we use the vocabulary created by Tapi Nzali et al [<xref ref-type="bibr" rid="ref45">45</xref>] to replace the patients’ terms with biomedical terms used by health professionals and presented in shared medical resources. For example, <italic>crabe</italic> [crab] is replaced by <italic>cancer</italic>, <italic>onco</italic> is replaced by <italic>oncologue</italic> [oncologist].</p>
            </list-item>
            <list-item>
              <p><italic>Correction of spelling.</italic> Spelling correction is important to remove redundant dimensions of data and to improve part-of-speech tagging, which is the basis for many statistical and rule-based methods in natural language processing. We apply spelling correction based on specialized dictionaries constructed ad hoc and the open source tool GNU Aspell version 0.60.6.1, whose algorithm proposes a list of possible corrections for unknown terms from the corpus. We use the following ad hoc dictionaries: lists of breast cancer drugs and of secondary effects, and proper names extracted from forum metadata (usernames, user residence) and from narratives (terms with capital first letter not at the beginning of a sentence; usernames identified from salutations at the beginning of forum posts).</p>
            </list-item>
            <list-item>
              <p><italic>Extraction and deletion of forum pseudonyms.</italic> All the pseudonyms, previously extracted from each website, are used. The pseudonyms are extracted and deleted if they exist in the post.</p>
            </list-item>
          </list>
        </sec>
      </sec>
      <sec>
        <title>Unsupervised Topic Detection and Assigning</title>
        <sec>
          <title>Modeling Topics With Latent Dirichlet Allocation</title>
          <p>Today, detection of latent semantic structures and topics has become a very active field of research in the text mining community. We focused on the LDA model [<xref ref-type="bibr" rid="ref27">27</xref>], which has become a standard model for unsupervised topic detection from a text corpus. It is a probabilistic model with a hierarchical definition of its components. With the LDA model, we generated new documents from a given model. Based on the relatively simple and robust bag-of-words representation of text documents, it leaves the order of occurrence of terms and sentence structure out of consideration. For a given corpus of <italic>D</italic> documents, we first defined the relevant vocabulary <italic>V</italic>, a preprocessed collection of terms occurring in the corpus. Typical preprocessing steps include spelling correction, lemmatization, and the removal of noisy or irrelevant terms. To define a topic <italic>t</italic>, we associated a nonnegative weight ω<sub>ti</sub> with each of the vocabulary’s terms, <italic>w</italic><sub>i</sub>, so that weights summed up to 1 (∑<sup>V</sup><sub>i=1</sub> ω<sub>ti</sub>=1). In practice, each topic typically consisted of a relatively small number of terms with nonnegligible weight. An LDA model uses a fixed number <italic>K</italic>&#62;1 of topics. For each document <italic>d</italic>, weights ω<sub>dt</sub>≥0 indicate the occurrence probability of terms from topic <italic>t</italic>, where the sum of ω<sub>dt</sub> over all topics <italic>t</italic> yields 1 (∑<sup>K</sup><sub>t=1</sub> ω<sub>dt</sub>=1). If document <italic>d</italic> contains <italic>l</italic><sub>d</sub> terms (or “positions”), we associated a topic <italic>t</italic><sub>dj</sub> with each of the positions <italic>j</italic>=1,..., <italic>l</italic><sub>d</sub>, where the probability of associating topic <italic>t</italic> is ω<sub>dt</sub>.
Finally, each position was filled with a term, <italic>w</italic><sub>dj</sub>, from the vocabulary, where the probability of using term <italic>w</italic><sub>i</sub> is ω<sub>t<sub>dj</sub>i</sub>.</p>
          <p>The corpus-generation model is proposed by the algorithm shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p>
          <p>The principal information that we can learn from using such a model on a corpus of text data is the structure of represented topics and the distribution of topics over the documents contained in the corpus. The high number of unknown parameters in this model makes inference challenging, yet Bayesian techniques such as Gibbs sampling [<xref ref-type="bibr" rid="ref46">46</xref>] have proven reliable. Based on prior assumptions about the distribution of the weights of terms in topics and of topics in documents on a range from very uniform to very spiked, these inference techniques are applied to the data to estimate the posterior distributions of the model. Most importantly, the most likely topic structure and the occurrence probabilities for topics in each document are proposed. In this work, we considered a message as a document.</p>
          <fig id="figure1" position="float">
            <label>Figure 1</label>
            <caption>
              <p>Algorithm proposing the corpus-generation model.</p>
            </caption>
            <graphic xlink:href="medinform_v5i3e23_fig1.jpg" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Crucial Model Parameters</title>
          <p>Besides <italic>K</italic>, 2 parameters often denoted as α and β strongly influence the distribution of topic probabilities for each of the messages. They are concentration parameters for the prior distributions of topics over a message (α) and of words over a topic (β). When α or β is smaller than 1 and decreases, prior mass concentrates closer and closer to the border of the simplex with spikes at each of its vertices. Then, 1 or a few components (topics for α, words for β) carry strong probability in the mixture distribution. In the limit 0, a single component is selected with a probability of 1. On the contrary, when α or β is larger than 1 and increases, mass concentrates more and more in the barycenter of the simplex, leading to a mixture distribution that is more and more balanced over all components. In the limit ∞, each component is selected with a probability of 1 over the number of components.</p>
          <p>Now we will explain our choice of α based on the influence of α on the distribution of topic probabilities for messages and of term distributions for topics. When α=1, the prior distribution for the vector of topic probabilities corresponds to a uniform distribution on the simplex with <italic>K</italic> vertices. As α increases, the distribution concentrates more and more strongly toward the center of the simplex, such that most of the probabilities are closer to 1/<italic>K</italic>. As α decreases, it concentrates more and more strongly toward the vertices, leading to some probabilities being further away from 1/<italic>K</italic>. For fixed α, probabilities concentrate more and more around 1/<italic>K</italic> as <italic>K</italic> increases. In Griffiths and Steyvers [<xref ref-type="bibr" rid="ref47">47</xref>], values α=α<sub>0</sub>/<italic>K</italic> with the constant α<sub>0</sub>=50 are encouraged, where dividing by <italic>K</italic> keeps a certain complexity measure of the model constant. Exploratory analysis showed that α<sub>0</sub>=50 led to very flat probability vectors in our case, which made it difficult to attribute a small number of topics for indexation to each message. On the other hand, smaller values of α<sub>0</sub> led to topics becoming more difficult to interpret due to flatter distribution of term probabilities within topics and similar dominating terms in multiple topics. After careful analysis of topics and posterior distributions for a range of values of α<sub>0</sub>, we decided to fix α<sub>0</sub>=10. Whereas higher values of α<sub>0</sub> yielded a better fit of the model in terms of its likelihood, they led to very flat posterior probabilities for the topic distribution of messages. As in Griffiths and Steyvers [<xref ref-type="bibr" rid="ref47">47</xref>], we decided to fix the value of parameter β to 0.1 for our experiments.</p>
          <p>There is evidence [<xref ref-type="bibr" rid="ref48">48</xref>] that automatic choice of parameters through a model selection criterion may result in an unsatisfactory topic collection, whose interpretation is more challenging than topics associated with suboptimal values of the criterion. Often, the calculation of held-out likelihood is used, allowing for approaches such as likelihood cross-validation. However, the likelihood calculation is not trivial, and some standard methods produce inaccurate results (see [<xref ref-type="bibr" rid="ref49">49</xref>]).</p>
        </sec>
        <sec>
          <title>Vocabulary Definition</title>
          <p>To avoid noisy topics that are difficult to interpret, it is useful to focus on terms with potential medical relevance. Here, we defined terms as sequences of words, and often there was only a single word. To begin, we used terms indexed in the French version of the Medical Subject Headings (MeSH) [<xref ref-type="bibr" rid="ref50">50</xref>]. Then we added terms figuring in a list of breast cancer drugs (extracted from the online resource) or appearing in a list of nonconventional treatments (extracted from the French Wikipedia entry). We denoted this term set as <italic>MED</italic>. We retained 481,111 occurrences of 18,672 terms in 16,868 messages on cancerdusein.org, and 626,043 occurrences of 18,741 terms in 70,092 messages on Facebook. The resulting topics, often strongly dominated by a single term, appeared to be rather difficult to interpret by clinical experts, possibly due to the relatively small dimension of the term-document space. We categorized terms figuring in the representative terms according to their grammatical role: nouns/proper names (<italic>NN</italic>), verbs (<italic>V</italic>), and adjectives (<italic>A</italic>). Then, we extracted topics by applying LDA to the original <italic>MED</italic> term set, extended by terms according to scenarios <italic>MED+NN+V+A</italic>. Based on the exploratory inspection of topics extracted by LDA in the approaches presented in the following, we further removed a small number of strongly represented terms leading to strong noise (<italic>femme</italic> [woman], <italic>temps</italic> [time or weather]), and medically meaningless topics.</p>
        </sec>
      </sec>
      <sec>
        <title>Align Topics and Questionnaires</title>
        <p>With the topics returned by the LDA model, we automatically identified correspondences between the topics and the questionnaires, as shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>. To align topics and questionnaires, we computed a distance between each question <italic>q</italic><sub>j</sub> and all topics <italic>t</italic><sub>i</sub> in <italic>T</italic>. We kept the topic with the highest distance. To compute the distance between an LDA topic and an item of the questionnaire, we customized the Jaccard coefficient [<xref ref-type="bibr" rid="ref51">51</xref>] by taking into account the probability of the words obtained with the LDA model, as shown in <xref ref-type="fig" rid="figure3">Figure 3</xref> (equation 1).</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Automatic identification of correspondences between topics and questionnaires. LDA: latent Dirichlet allocation; <italic>MED</italic> + <italic>NN</italic> + <italic>V</italic> + <italic>A</italic>: set of medically relevant terms (<italic>MED</italic>) extended by terms categorized by their grammatical role (<italic>NN</italic>: nouns and proper names; <italic>V</italic>: verbs; <italic>A</italic>: adjectives).</p>
          </caption>
          <graphic xlink:href="medinform_v5i3e23_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Equation to calculate the distance between a latent Dirichlet allocation topic and an item of the questionnaire.</p>
          </caption>
          <graphic xlink:href="medinform_v5i3e23_fig3.jpg" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Topic Modeling Result</title>
        <p>To run experiments, we used the R package LDA [<xref ref-type="bibr" rid="ref52">52</xref>] and the R environment version 3.2.5 (R Foundation) for the implementation. We tested different scenarios, and an expert validated and labeled the topics and verified the association between topics and questionnaire items. The expert is a biostatistician and QoL researcher in the cancer field [<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref54">54</xref>].</p>
        <p>In scenario <italic>MED</italic> + <italic>NN</italic>, most of the topics were of a factual nature, whereas scenario <italic>MED</italic> + <italic>NN</italic> + <italic>V</italic> led to a more complete description of topics, where verbs often add information about actions undertaken by users and other stakeholders (wait, consult, seek, support, etc) and about user sentiment (feel, cry, tire, fear, accept, etc). In scenario <italic>MED</italic> + <italic>NN</italic> + <italic>V</italic> + <italic>A</italic>, several topics consisting mainly of emotional words were difficult to interpret from a medical point of view. We reported the stability of the majority of topics that were identified through the scenarios <italic>MED</italic> + <italic>NN</italic>, <italic>MED</italic> + <italic>NN</italic> + <italic>V</italic>, and <italic>MED</italic> + <italic>NN</italic> + <italic>V</italic> + <italic>A</italic> due to the similarity of dominating terms. After careful analysis, we narrowed down the choice of <italic>K</italic> to a value between 20 and 30. With more than 20 topics, we found duplication of topics (2 topics may deal with the same subject). In addition, some could not be interpreted (the medical expert found no meaning). Consequently, we decided to retain scenario <italic>MED</italic> + <italic>NN</italic> + <italic>V</italic> + <italic>A</italic> with 20 topics. Finally, we fixed <italic>K</italic>=20 for the duration of this study. For each topic, we showed only the 20 keywords having the highest probabilities under that topic. These keywords were presented to the expert. <xref ref-type="table" rid="table2">Table 2</xref> and <xref ref-type="table" rid="table3">Table 3</xref> list the topic modeling results of the 2 corpora. We show the top 10 keywords for each topic. <xref ref-type="table" rid="table4">Table 4</xref> shows the results of the 20 topics interpreted by the medical expert on the 2 corpora.</p>
       
       
      </sec>
      <sec>
        <title>Relationships Between Questionnaire Topics</title>
        <p>In this work, we used 2 QoL questionnaires (EORTC QLQ-C30 and EORTC QLQ-BR23) to look for relationships between the studied dimensions in these previous questionnaires and topics that we interpreted. The EORTC QLQ-C30 is a 30-item, self-administered, cancer-specific questionnaire designed to measure QoL in the cancer population. The assessment comprises 5 functional scales (physical, role, cognitive, emotional, and social), 8 symptomatic scales (fatigue, nausea and vomiting, pain, dyspnea, insomnia, loss of appetite, constipation, and diarrhea), 1 scale measuring financial difficulties, and 1 scale measuring global health status and QoL by a score ranging from 0 to 100 through the 30 items [<xref ref-type="bibr" rid="ref20">20</xref>]. The EORTC QLQ-BR23 is a 23-item, self-administered, breast cancer-specific questionnaire, usually administered with the EORTC QLQ-C30, designed to measure QoL in the breast cancer population at various stages and with patients with differing treatment modalities. The assessment comprises 4 functional scales (body image, sexual functioning, sexual enjoyment, and future perspective) and 4 symptomatic scales (systemic therapy side effects, breast symptoms, arm symptoms, and hair loss) [<xref ref-type="bibr" rid="ref55">55</xref>]. The EORTC health-related QoL questionnaires are built on a Likert scale with polytomous items.</p>
        <p>To find the theme corresponding to a question, we used equation 1 (<xref ref-type="fig" rid="figure3">Figure 3</xref>) proposed above. We obtained the following relationships:</p>
        <list list-type="bullet">
          <list-item>
            <p>Topic <italic>sexuality</italic> is related to items 44 (To what extent were you interested in sex?) and 45 (To what extent were you sexually active?).</p>
          </list-item>
          <list-item>
            <p>Topic <italic>hair loss</italic> is related to item 34 (Have you lost any hair?).</p>
          </list-item>
          <list-item>
            <p>Topic <italic>body care and body image during cancer</italic> is related to items 39 (Have you felt physically less attractive as a result of your disease or treatment?) and 40 (Have you been feeling less feminine as a result of your disease or treatment?).</p>
          </list-item>
        </list>
        <p>These relationships were validated by a medical expert. Following validation of the results, we calculated the precision. On cancerdusein.org data, for the 53 items, 39 relationships with topics were validated by the medical expert and 14 were invalidated, for a precision of 74%. On Facebook data, for the 53 items, 36 relationships were validated by the medical expert and 17 were invalidated, for a precision of 68%. The medical expert also manually examined the invalidated relationships. This step reduced the time spent by the expert to find relationships between the questions and the topics. The obtained precision rates can be explained by the fact that the items of the questionnaires are composed of very short sentences. On average, these sentences contain fewer than 5 words.</p>

 <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Top 10 frequently occurring words for the first 10 topics (among the 20 found) on cancerdusein.org forum data.</p>
          </caption>
          <table width="767" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="38"/>
            <col width="304"/>
            <col width="250"/>
            <col width="117"/>
            <thead>
              <tr valign="top">
                <td rowspan="2">Topic no.</td>
                <td colspan="2">Top 10 words with their translation</td>
                <td rowspan="2">Topic label<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>French</td>
                <td>English translation</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td><italic>cheveu</italic>, <italic>perdre</italic>, <italic>perruque</italic>, <italic>tomber</italic>, <italic>tête</italic>, <italic>commencer</italic>, <italic>repousser</italic>, <italic>chimiothérapie</italic>, <italic>perte</italic>, <italic>foulard</italic></td>
                <td>hair, lose, wig, fall, head, begin, regrowth, chemotherapy, loss, scarf</td>
                <td>Hair loss</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td><italic>prendre</italic>, <italic>temps</italic>, <italic>travail</italic>, <italic>demander</italic>, <italic>soin</italic>, <italic>reprendre</italic>, <italic>charge</italic>, <italic>travailler</italic>, <italic>aide</italic>, <italic>payer</italic></td>
                <td>take, time, job, ask, care, restart, charge, work, help, pay</td>
                <td>Work life during cancer and financial aspects</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td><italic>effet</italic>, <italic>chimiothérapie</italic>, <italic>secondaire</italic>, <italic>cure</italic>, <italic>douleur</italic>, <italic>passer</italic>, <italic>mammographie</italic>, <italic>nausée</italic>, <italic>docétaxel</italic>, <italic>fatigue</italic></td>
                <td>effect, chemotherapy, secondary, treatment, pain, pass, mammography, nausea, docetaxel, fatigue</td>
                <td>Chemotherapy and its secondary effects</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td><italic>prendre</italic>, <italic>effet</italic>, <italic>douleur</italic>, <italic>traitement</italic>, <italic>problème</italic>, <italic>tamoxifène</italic>, <italic>prise</italic>, <italic>penser</italic>, <italic>secondaire</italic>, <italic>arrêter</italic></td>
                <td>take, effect, pain, treatment, problem, tamoxifen, catch, think, secondary, stop</td>
                <td>Hormone therapy and its secondary effects</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td><italic>sein</italic>, <italic>bras</italic>, <italic>chirurgie</italic>, <italic>reconstruction</italic>, <italic>opération</italic>, <italic>douleur</italic>, <italic>prothèse</italic>, <italic>opérer</italic>, <italic>enlever</italic>, <italic>cicatrice</italic></td>
                <td>breast, arm, surgery, reconstruction, operation, pain, prosthesis, operate, remove, scar</td>
                <td>Breast reconstruction</td>
              </tr>
              <tr valign="top">
                <td>6</td>
                <td><italic>baiser</italic>, <italic>petit</italic>, <italic>beau</italic>, <italic>super</italic>, <italic>attendre</italic>, <italic>soutien</italic>, <italic>nouveau</italic>, <italic>guerrier</italic>, <italic>grand</italic>, <italic>vérité</italic></td>
                <td>kiss, little, beautiful, great, wait, support, new, warrior, big, truth</td>
                <td>Support from patient’s family and friends</td>
              </tr>
              <tr valign="top">
                <td>7</td>
                <td><italic>ongle</italic>, <italic>peau</italic>, <italic>radiothérapie</italic>, <italic>main</italic>, <italic>séance</italic>, <italic>pied</italic>, <italic>rayon</italic>, <italic>brûlure</italic>, <italic>crème</italic>, <italic>conseil</italic></td>
                <td>nail, skin, radiotherapy, hand, session, foot, radius, burn, cream, council</td>
                <td>Radiotherapy and its secondary effects</td>
              </tr>
              <tr valign="top">
                <td>8</td>
                <td><italic>prendre</italic>, <italic>manger</italic>, <italic>boire</italic>, <italic>essayer</italic>, <italic>miel</italic>, <italic>aider</italic>, <italic>produit</italic>, <italic>demander</italic>, <italic>santé</italic>, <italic>complément</italic></td>
                <td>take, eat, drink, try, honey, help, product, ask, health, complement</td>
                <td>Complementary and alternative medicine</td>
              </tr>
              <tr valign="top">
                <td>9</td>
                <td><italic>lire</italic>, <italic>forum</italic>, <italic>message</italic>, <italic>venir</italic>, <italic>nouveau</italic>, <italic>donner</italic>, <italic>trouver</italic>, <italic>site</italic>, <italic>réponse</italic>, <italic>écrire</italic></td>
                <td>read, forum, message, come, new, give, find, site, response, write</td>
                <td>Media and forum information exchange</td>
              </tr>
              <tr valign="top">
                <td>10</td>
                <td><italic>homonymie</italic>, <italic>enfant</italic>, <italic>fille</italic>, <italic>maman</italic>, <italic>vie</italic>, cancer, <italic>vérité</italic>, <italic>vivre</italic>, <italic>malade</italic>, <italic>famille</italic></td>
                <td>homonymy, child, girl, mom, life, cancer, truth, live, sick, family</td>
                <td>Family background and breast cancer</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Topic label was assigned by a medical expert.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>

 <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Top 10 frequently occurring words for the first 10 topics (among the 20 topics found) on Facebook data.</p>
          </caption>
          <table width="759" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="42"/>
            <col width="296"/>
            <col width="246"/>
            <col width="117"/>
            <thead>
              <tr valign="top">
                <td rowspan="2">Topic no.</td>
                <td colspan="2">Top 10 words</td>
                <td rowspan="2">Topic label<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>French</td>
                <td>English translation</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td><italic>voir</italic>, <italic>attendre</italic>, <italic>résultat</italic>, <italic>médecin</italic>, <italic>oncologie</italic>, <italic>examen</italic>, <italic>biopsie</italic>, <italic>mammographie</italic>, <italic>contrôle</italic>, <italic>scanner</italic></td>
                <td>see, wait, result, doctor, oncology, examination, biopsy, mammography, test, scanner</td>
                <td>Diagnosis</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td><italic>douleur</italic>, <italic>effet</italic>, <italic>chimiothérapie</italic>, <italic>secondaire</italic>, <italic>jour</italic>, <italic>prendre</italic>, <italic>mal</italic>, <italic>fatigue</italic>, <italic>nausée</italic>, <italic>chaleur</italic></td>
                <td>pain, effect, chemotherapy, secondary, day, take, bad, fatigue, nausea, heat</td>
                <td>Chemotherapy and its secondary effects</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td><italic>justice</italic>, <italic>moral</italic>, <italic>garder</italic>, <italic>aller</italic>, <italic>fort</italic>, <italic>dureté</italic>, <italic>battre</italic>, <italic>étape</italic>, <italic>force</italic>, <italic>combat</italic></td>
                <td>justice, morale, keep, go, strong, hardness, beat, step, strength, fight</td>
                <td>Breast cancer as a daily battle</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td><italic>cheveu</italic>, <italic>perdre</italic>, <italic>tomber</italic>, <italic>repousser</italic>, <italic>perruque</italic>, <italic>couper</italic>, <italic>raser</italic>, <italic>tête</italic>, <italic>joli</italic>, <italic>foulard</italic></td>
                <td>hair, lose, fall, growth, wig, cut, shave, head, beautiful, scarf</td>
                <td>Hair loss</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td><italic>prendre</italic>, <italic>suivre</italic>, <italic>dire</italic>, <italic>soin</italic>, <italic>arrêter</italic>, <italic>traitement</italic>, <italic>tamoxifène</italic>, <italic>poids</italic>, <italic>perdre</italic>, <italic>homonymie</italic></td>
                <td>take, follow, tell, care, stop, treatment, tamoxifen, weight, lose, homonymy</td>
                <td>Secondary effects of treatments</td>
              </tr>
              <tr valign="top">
                <td>6</td>
                <td><italic>aller</italic>, <italic>justice</italic>, <italic>passer</italic>, <italic>sexologie</italic>, <italic>allergologie</italic>, <italic>baiser</italic>, <italic>penser</italic>, <italic>meilleur</italic>, <italic>voir</italic>, <italic>reposer</italic></td>
                <td>go, justice, pass, sexology, allergology, kiss, think, best, see, rest</td>
                <td>Body care and body image during cancer</td>
              </tr>
              <tr valign="top">
                <td>7</td>
                <td><italic>homonymie</italic>, <italic>dire</italic>, <italic>vérité</italic>, <italic>suivre</italic>, <italic>peur</italic>, <italic>sexologie</italic>, <italic>comprendre</italic>, <italic>croire</italic>, <italic>dureté</italic>, <italic>enfant</italic></td>
                <td>homonymy, tell, truth, follow, fear, sexology, understand, believe, hardness, child</td>
                <td>Family background and breast cancer</td>
              </tr>
              <tr valign="top">
                <td>8</td>
                <td><italic>demander</italic>, <italic>suivre</italic>, <italic>droit</italic>, <italic>travail</italic>, <italic>aide</italic>, <italic>médecin</italic>, <italic>payer</italic>, <italic>charge</italic>, <italic>travailler</italic>, <italic>donner</italic></td>
                <td>ask, follow, law, job, help, doctor, pay, charge, work, give</td>
                <td>Work life during cancer and financial aspects</td>
              </tr>
              <tr valign="top">
                <td>9</td>
                <td><italic>sein</italic>, <italic>opération</italic>, <italic>reconstruction</italic>, <italic>enlever</italic>, <italic>bras</italic>, <italic>opérer</italic>, <italic>mastectomie</italic>, <italic>cicatrice</italic>, <italic>retirer</italic>, <italic>prothèse</italic></td>
                <td>breast, operation, reconstruction, remove, arm, operate, mastectomy, scar, withdraw, prosthesis</td>
                <td>Breast reconstruction</td>
              </tr>
              <tr valign="top">
                <td>10</td>
                <td><italic>suivre</italic>, <italic>aller</italic>, <italic>fille</italic>, <italic>sol</italic>, <italic>voir</italic>, <italic>rire</italic>, <italic>regarder</italic>, <italic>marier</italic>, <italic>croire</italic>, <italic>lire</italic></td>
                <td>follow, go, girl, ground, see, laugh, look, marry, believe, read</td>
                <td>Support from patient’s family and friends</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>Topic label was assigned by a medical expert.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>List of identified topic titles with <italic>K</italic>=20 in collaboration with an expert.</p>
          </caption>
          <table width="620" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="42"/>
            <col width="254"/>
            <col width="280"/>
            <thead>
              <tr valign="top">
                <td>Topic no.</td>
                <td>cancerdusein.org</td>
                <td>Facebook</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>Hair loss</td>
                <td>Diagnosis</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>Work life during cancer and financial aspects</td>
                <td>Chemotherapy and its secondary effects</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>Chemotherapy and its secondary effects</td>
                <td>Breast cancer as a daily battle</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>Hormone therapy and its secondary effects</td>
                <td>Hair loss</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>Breast reconstruction</td>
                <td>Secondary effects of treatments</td>
              </tr>
              <tr valign="top">
                <td>6</td>
                <td>Support from patient’s family and friends</td>
                <td>Body care and body image during cancer</td>
              </tr>
              <tr valign="top">
                <td>7</td>
                <td>Radiotherapy and its secondary effects</td>
                <td>Family background and breast cancer</td>
              </tr>
              <tr valign="top">
                <td>8</td>
                <td>Complementary and alternative medicine</td>
                <td>Work life during cancer and financial aspects</td>
              </tr>
              <tr valign="top">
                <td>9</td>
                <td>Media and forum information exchange</td>
                <td>Breast reconstruction</td>
              </tr>
              <tr valign="top">
                <td>10</td>
                <td>Family members with breast cancer</td>
                <td>Support from patient’s family and friends</td>
              </tr>
              <tr valign="top">
                <td>11</td>
                <td>Treatment period</td>
                <td>Interaction with nurses and doctors</td>
              </tr>
              <tr valign="top">
                <td>12</td>
                <td>Everyday life during cancer</td>
                <td>Anxiety and fatigue</td>
              </tr>
              <tr valign="top">
                <td>13</td>
                <td>Healing</td>
                <td>Healing of family member</td>
              </tr>
              <tr valign="top">
                <td>14</td>
                <td>Search for medical information</td>
                <td>Relapse</td>
              </tr>
              <tr valign="top">
                <td>15</td>
                <td>Mourning</td>
                <td>Sexuality</td>
              </tr>
              <tr valign="top">
                <td>16</td>
                <td>Diagnosis</td>
                <td>Body care and body image during cancer</td>
              </tr>
              <tr valign="top">
                <td>17</td>
                <td>Breast cancer as a daily battle</td>
                <td>Family members with breast cancer</td>
              </tr>
              <tr valign="top">
                <td>18</td>
                <td>Body care and body image during cancer and sexuality</td>
                <td>Healing</td>
              </tr>
              <tr valign="top">
                <td>19</td>
                <td>Surgery</td>
                <td>Support from patient’s family and friends</td>
              </tr>
              <tr valign="top">
                <td>20</td>
                <td>Waiting for results of analysis, concerns</td>
                <td>Treatment period</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>




        <p><xref ref-type="table" rid="table5">Table 5</xref> shows the relationships between topics from questionnaires and those we found in the 2 corpora. The first column lists the topics of the 2 questionnaires, with the corresponding questionnaire items shown in column 2. Columns 3 and 4 give the corresponding topics obtained with LDA in the 2 corpora. <xref ref-type="table" rid="table6">Table 6</xref> shows the percentage of documents belonging to each topic in cancerdusein.org and Facebook. We noticed that the numbers of messages belonging to each topic are almost equal; this shows the importance of all the topics that we found and that were discussed by patients.</p>
        <sec>
          <title>Data From cancerdusein.org</title>
          <p>We succeeded in interpreting the 20 topics obtained from the output of our model on the cancerdusein.org corpus. <xref ref-type="table" rid="table2">Table 2</xref> presents the first 10 topics and the top 10 words obtained by our model that were interpreted by an expert. Some relationships were established. In the QLQ-C30, we found matches for all of the topics except for global health status and QoL. In the QLQ-BR23 form, we matched all of the topics.</p>
        </sec>
        <sec>
          <title>Data From Facebook</title>
          <p>We succeeded in interpreting the 20 topics obtained from the output of our model on the Facebook corpus. <xref ref-type="table" rid="table3">Table 3</xref> presents the first 10 topics and the top 10 words obtained by our model that were interpreted by an expert. Some relationships were established. In the QLQ-C30, we found matches for all of the topics except for role functioning, cognitive functioning, and global health status and QoL. In the QLQ-BR23 form, we matched all of the topics.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <p>We have presented what we believe to be the first study of health social media data in French, as a potential source of analysis of the QoL for breast cancer patients. We used accurate machine learning models to identify topics discussed in online breast cancer support groups. Then we examined the relationships between the discovered topics and studied dimensions from QoL self-administered questionnaires. Exploratory and in-depth analysis of these data is a potential source of candid information as an alternative to analysis of QoL based on self-administered questionnaires.</p>
      <sec>
        <title>Limitations</title>
        <sec>
          <title>Patient-Authored Text</title>
          <p>The first limitation of this study is the type of users who produced the patient-authored text exploited in our process. Indeed, unless a group has formal gatekeeping of members, it is difficult to know for sure whether people posting to a forum or in a Facebook group are patients, survivors, health care professionals, care providers, family, or friends of patients. Consequently, topics extracted with our method may have been generated by users who do not have breast cancer. In particular, it has been known for decades that health information is sought principally by friends or family members, and then after that by patients [<xref ref-type="bibr" rid="ref56">56</xref>]. In this work, we assumed that the relatives’ topics of interest were similar to patients’ topics of interest. However, in a previous work [<xref ref-type="bibr" rid="ref57">57</xref>], we proposed a method to automatically deduce the role of the forum user. This method can be used at the beginning of our chain to exclude the posts of individuals who are not actual patients.</p>
        </sec>
        <sec>
          <title>Generalization of the Method</title>
          <p>The second limitation is that we harvested data from only 1 forum and different Facebook groups. However, this forum is frequently recommended by French physicians to patients. It is also recommended by INCA, which is the French reference organization in oncology. We deliberately selected this forum and these Facebook groups to examine similarities and differences within and between these 2 particular communities. Of course, there are certainly many other online communities related to breast cancer, and the users in these 2 online communities were not necessarily representative of users of all breast cancer social media.</p>
          <p>It is also important to note that our method can be easily applied to other diseases. For example, we can (1) use brain cancer forum data to align topics discussed by patients with items of the EORTC QLQ-C30 and the brain cancer module (QLQ-BN20) [<xref ref-type="bibr" rid="ref58">58</xref>] questionnaires, and (2) use lung cancer forum data to align topics discussed by patients with items of the QLQ-C30 and the lung cancer module (QLQ-LC13) [<xref ref-type="bibr" rid="ref59">59</xref>]. We have also already applied a similar approach to study other social media data such as Twitter [<xref ref-type="bibr" rid="ref60">60</xref>]. The main adaptation is relative to the acquisition of the patient terms, which are specific to the disease and the social media as mentioned in the Data Preprocessing section above.</p>
        </sec>
        <sec>
          <title>Latent Dirichlet Allocation Model</title>
          <p>A third limitation was the choice of LDA. LDA requires much manual tuning of its parameters, which vary from task to task. We spent a lot of time finding the best parameters so that the results could be interpreted meaningfully. Such analysis itself becomes a sort of “overfitting” to the task at hand, making it very hard to generalize the method to other datasets and other tasks. However, we efficiently defined parameters of 2 types of text (forum and Facebook posts), which can be reused for other studies on comparable corpora.</p>
          <p>Topics covered on social media focused on a specific domain, breast cancer. It was difficult to adjust the number of topics because topics were closed: all of the users were discussing breast cancer. When we adjusted the model and sought the optimal <italic>K</italic> with methods such as those used in other studies (eg, [<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref61">61</xref>,<xref ref-type="bibr" rid="ref62">62</xref>]), we obtained more than 50 topics. An interesting perspective was using the heuristic approach defined by Zhao et al [<xref ref-type="bibr" rid="ref63">63</xref>] to determine an appropriate number of topics. This method is based on the rate of perplexity change [<xref ref-type="bibr" rid="ref62">62</xref>,<xref ref-type="bibr" rid="ref64">64</xref>]. This measure is commonly used in information theory to evaluate how well a statistical model describes a dataset, with lower perplexity denoting a better probabilistic model [<xref ref-type="bibr" rid="ref63">63</xref>]. Finally, as in Arnold et al [<xref ref-type="bibr" rid="ref65">65</xref>], we observed that an expert is not able to interpret so many topics. In this study, we manually fixed <italic>K</italic>=20. We interpreted all the topics with minimal redundancies.</p>
          <table-wrap position="float" id="table5">
            <label>Table 5</label>
            <caption>
              <p>Relationships between topics from the questionnaires and topics found on cancerdusein.org and Facebook.</p>
            </caption>
            <table width="828" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="49"/>
              <col width="54"/>
              <col width="141"/>
              <col width="92"/>
              <col width="213"/>
              <col width="193"/>
              <thead>
                <tr valign="top">
                  <td colspan="3">Questionnaires and their scales</td>
                  <td>Questionnaire items</td>
                  <td>cancerdusein.org</td>
                  <td>Facebook</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="6"><bold>EORTC QLQ-C30<sup>a</sup></bold></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td colspan="2"><bold>Functional scales</bold></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Physical functioning</td>
                  <td>1-5</td>
                  <td>Everyday life during cancer</td>
                  <td>Treatment period</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Treatment period</td>
                  <td><break/></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Role functioning</td>
                  <td>6, 7</td>
                  <td>Everyday life during cancer</td>
                  <td><break/></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Emotional functioning</td>
                  <td>21-24</td>
                  <td>Diagnosis</td>
                  <td>Diagnosis</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Breast cancer as a daily battle</td>
                  <td>Breast cancer as a daily battle</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Waiting for results of analysis, concerns</td>
                  <td>Anxiety and fatigue</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Support from patient’s family and friends</td>
                  <td>Support from patient’s family and friends</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Cognitive functioning</td>
                  <td>20, 25</td>
                  <td>Search for medical information</td>
                  <td><break/></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Media and forum information exchange</td>
                  <td><break/></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Social functioning</td>
                  <td>26, 27</td>
                  <td>Support from patient’s family and friends</td>
                  <td>Support from patient’s family and friends</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Work life during cancer and financial aspects</td>
                  <td>Work life during cancer and financial aspects</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td colspan="2"><bold>Symptom scales</bold></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Fatigue</td>
                  <td>10, 12, 18</td>
                  <td>Chemotherapy and its secondary effects</td>
                  <td>Anxiety and fatigue</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Secondary effects of treatments</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Nausea and vomiting</td>
                  <td>14, 15</td>
                  <td>Chemotherapy and its secondary effects</td>
                  <td>Secondary effects of treatments</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Pain</td>
                  <td>9, 19</td>
                  <td>Chemotherapy and its secondary effects</td>
                  <td rowspan="2">Secondary effects of treatments</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Surgery</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Dyspnea</td>
                  <td>8</td>
                  <td>Chemotherapy and its secondary effects</td>
                  <td>Secondary effects of treatments</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Insomnia</td>
                  <td>11</td>
                  <td>Chemotherapy and its secondary effects</td>
                  <td>Secondary effects of treatments</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Appetite loss</td>
                  <td>13</td>
                  <td>Chemotherapy and its secondary effects</td>
                  <td>Secondary effects of treatments</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Constipation</td>
                  <td>16</td>
                  <td>Chemotherapy and its secondary effects</td>
                  <td>Secondary effects of treatments</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Diarrhea</td>
                  <td>17</td>
                  <td>Chemotherapy and its secondary effects</td>
                  <td>Secondary effects of treatments</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Financial difficulties</td>
                  <td>28</td>
                  <td>Work life during cancer and financial aspects</td>
                  <td>Work life during cancer and financial aspects</td>
                </tr>
                <tr valign="top">
                  <td colspan="3"><bold>Global health status</bold></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td colspan="2">Global health status and quality of life</td>
                  <td>29, 30</td>
                  <td><break/></td>
                  <td><break/></td>
                </tr>
                <tr valign="top">
                  <td colspan="6"><bold>EORTC QLQ-BR23<sup>b</sup></bold></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td colspan="2"><bold>Functional scales</bold></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Body image</td>
                  <td>39-42</td>
                  <td>Breast reconstruction</td>
                  <td>Breast reconstruction</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Body care and body image during cancer, and sexuality</td>
                  <td>Body care and body image during cancer</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Surgery</td>
                  <td><break/></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Sexual functioning</td>
                  <td>44, 45</td>
                  <td>Body care and body image during cancer, and sexuality</td>
                  <td>Sexuality</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Sexual enjoyment</td>
                  <td>46</td>
                  <td>Body care and body image during cancer, and sexuality</td>
                  <td>Sexuality</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Future perspectives</td>
                  <td>43</td>
                  <td rowspan="2">Healing</td>
                  <td>Healing</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Relapse</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td colspan="2"><bold>Symptom scales</bold></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Systemic therapy</td>
                  <td>31-34</td>
                  <td>Chemotherapy and its secondary effects</td>
                  <td>Secondary effects of treatments</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Side effects</td>
                  <td>36-38</td>
                  <td>Hormone therapy and its secondary effects</td>
                  <td>Chemotherapy and its secondary effects</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Breast symptoms</td>
                  <td>50-53</td>
                  <td>Breast reconstruction</td>
                  <td>Breast reconstruction</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Radiotherapy and its secondary effects</td>
                  <td><break/></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Surgery</td>
                  <td><break/></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Arm symptoms</td>
                  <td>47-49</td>
                  <td>Breast reconstruction</td>
                  <td>Breast reconstruction</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Surgery</td>
                  <td><break/></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td>Hair loss</td>
                  <td>35</td>
                  <td>Hair loss</td>
                  <td>Hair loss</td>
                </tr>
                <tr valign="top">
                  <td colspan="6"><bold>Topics without a relationship</bold></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Complementary and alternative medicine</td>
                  <td><break/></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Mourning</td>
                  <td><break/></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Family background and breast cancer</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Family members with breast cancer</td>
                  <td>Family members with breast cancer</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td><break/></td>
                  <td>Healing of family member</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table5fn1">
                <p><sup>a</sup>EORTC QLQ-C30: European Organization for Research and Treatment of Cancer Quality of Life Questionnaire Core 30.</p>
              </fn>
              <fn id="table5fn2">
                <p><sup>b</sup>QLQ-BR23: breast cancer module.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <table-wrap position="float" id="table6">
            <label>Table 6</label>
            <caption>
              <p>Distribution of documents in each topic on cancerdusein.org and Facebook.</p>
            </caption>
            <table width="359" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="42"/>
              <col width="145"/>
              <col width="129"/>
              <thead>
                <tr valign="top">
                  <td>Topic no.</td>
                  <td>cancerdusein.org (n=16,868) <break/>n (%)</td>
                  <td>Facebook (n=70,092) <break/>n (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>1</td>
                  <td>978 (5.80)</td>
                  <td>3294 (4.70)</td>
                </tr>
                <tr valign="top">
                  <td>2</td>
                  <td>590 (3.50)</td>
                  <td>3925 (5.60)</td>
                </tr>
                <tr valign="top">
                  <td>3</td>
                  <td>1147 (6.80)</td>
                  <td>3785 (5.40)</td>
                </tr>
                <tr valign="top">
                  <td>4</td>
                  <td>860 (5.10)</td>
                  <td>4065 (5.80)</td>
                </tr>
                <tr valign="top">
                  <td>5</td>
                  <td>1315 (7.80)</td>
                  <td>2804 (4.00)</td>
                </tr>
                <tr valign="top">
                  <td>6</td>
                  <td>759 (4.50)</td>
                  <td>3715 (5.30)</td>
                </tr>
                <tr valign="top">
                  <td>7</td>
                  <td>810 (4.80)</td>
                  <td>3014 (4.30)</td>
                </tr>
                <tr valign="top">
                  <td>8</td>
                  <td>523 (3.10)</td>
                  <td>3084 (4.40)</td>
                </tr>
                <tr valign="top">
                  <td>9</td>
                  <td>877 (5.20)</td>
                  <td>3645 (5.20)</td>
                </tr>
                <tr valign="top">
                  <td>10</td>
                  <td>692 (4.10)</td>
                  <td>3505 (5.00)</td>
                </tr>
                <tr valign="top">
                  <td>11</td>
                  <td>675 (4.00)</td>
                  <td>2804 (4.00)</td>
                </tr>
                <tr valign="top">
                  <td>12</td>
                  <td>523 (3.10)</td>
                  <td>2734 (3.90)</td>
                </tr>
                <tr valign="top">
                  <td>13</td>
                  <td>1113 (6.60)</td>
                  <td>5047 (7.20)</td>
                </tr>
                <tr valign="top">
                  <td>14</td>
                  <td>692 (4.10)</td>
                  <td>3014 (4.30)</td>
                </tr>
                <tr valign="top">
                  <td>15</td>
                  <td>843 (5.00)</td>
                  <td>2804 (4.00)</td>
                </tr>
                <tr valign="top">
                  <td>16</td>
                  <td>1063 (6.30)</td>
                  <td>2734 (3.90)</td>
                </tr>
                <tr valign="top">
                  <td>17</td>
                  <td>1248 (7.40)</td>
                  <td>3575 (5.10)</td>
                </tr>
                <tr valign="top">
                  <td>18</td>
                  <td>540 (3.20)</td>
                  <td>5607 (8.00)</td>
                </tr>
                <tr valign="top">
                  <td>19</td>
                  <td>1198 (7.10)</td>
                  <td>3432 (4.90)</td>
                </tr>
                <tr valign="top">
                  <td>20</td>
                  <td>422 (2.50)</td>
                  <td>3505 (5.00)</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
        </sec>
      </sec>
      <sec>
        <title>Relationships Between Self-Administered Questionnaires and Social Media</title>
        <p>We were able to match most of the topics from QoL self-administered questionnaires in social media. These topics correspond to a total of 95% (22/23) of topics in the cancerdusein.org corpus and 86% (20/23) of topics in the Facebook corpus. These figures underline the importance of studying QoL, because they correspond to patients’ real concerns. The topics that corresponded with those of the EORTC QLQ-C30 and the EORTC QLQ-BR23 questionnaires were hair loss, work life during cancer and financial aspects, chemotherapy and its secondary effects, breast reconstruction, support from the patient’s family and friends, treatment period, healing, diagnosis, breast cancer as a daily battle, body care and body image during cancer and sexuality, hormone therapy and its secondary effects, radiotherapy and its secondary effects, media and forum information exchange, everyday life during cancer, search for medical information, surgery, waiting for results of analysis, concerns, secondary effects of treatments, interaction with nurses and doctors, anxiety and fatigue, and relapse.</p>
      </sec>
      <sec>
        <title>Emerging Topics in Social Media</title>
        <p>We also found 5 topics that are not present in QoL questionnaires. These topics correspond to a total of 15% (3/20) of the cancerdusein.org corpus and 15% (3/20) of the Facebook corpus. Of the 5 topics that do not appear in the questionnaires, 2 focus on patients. The emerging topics are complementary and alternative medicine, mourning, family background and breast cancer, family members with breast cancer, and healing of a family member. Among these 5 topics, we believe that 2 of them (complementary and alternative medicine, and family background and breast cancer) could be added to the QoL questionnaires. The topic complementary and alternative medicine focuses on nonconventional treatments and corresponded to a total of 3.10% (523/16,868) of the cancerdusein.org corpus. The topic family background and breast cancer focuses on the relationships of patients with their family, especially healing and grieving for a family member. This topic corresponded to a total of 4.30% (3014/70,092) of the Facebook corpus. The 3 other topics are not related to QoL. These topics deal with mourning, having family members with breast cancer, and healing of a family member. They were discussed by relatives of patients and not by patients.</p>
      </sec>
      <sec>
        <title>Different Uses of Forums and Social Networks</title>
        <p>One of the reasons that led us to use 2 data resources (social networks and a health forum) was to discover the topics discussed in each platform. <xref ref-type="table" rid="table7">Table 7</xref> presents the relationships between topics found in both social media and the percentage distribution of messages in each topic. Of 20 topics detected by our model in the forum and Facebook corpora, we found 11 common topics in the 2 corpora. Some of them have a similar frequency of discussion (<xref ref-type="table" rid="table6">Table 6</xref>). These topics are hair loss, work life during cancer, support from patient’s family and friends, treatment period, diagnosis, and family members with breast cancer. We observed that topics such as chemotherapy and its secondary effects, breast reconstruction, and breast cancer as a daily battle were discussed more on the forum than on Facebook, maybe because the subject is more technical. As <xref ref-type="table" rid="table7">Table 7</xref> shows, we noted that the topics support from a patient’s family and friends, body care and body image during cancer, and sexuality were discussed more on Facebook than on the forum because of visibility to friends. In the end, the topics discovered were quite similar. However, we observed a difference of length in the posts. Most of the time, posts from the health forum were longer than posts from Facebook. Even if the topics found in both social media were similar, messages from the forum provided more information and were better interpreted than messages from Facebook.</p>
        <table-wrap position="float" id="table7">
          <label>Table 7</label>
          <caption>
            <p>Relationships between topics found on both social media (cancerdusein.org and Facebook) with <italic>K</italic>=20 in collaboration with an expert.</p>
          </caption>
          <table width="767" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="64"/>
            <col width="231"/>
            <col width="62"/>
            <col width="77"/>
            <col width="65"/>
            <col width="80"/>
            <col width="87"/>
            <thead>
              <tr valign="top">
                <td rowspan="2" colspan="2">Topic names</td>
                <td colspan="2">cancerdusein.org (n=16,868)</td>
                <td colspan="2">Facebook (n=70,092)</td>
                <td rowspan="2">Matched to questionnaire item</td>
              </tr>
              <tr valign="top">
                <td>Topic no.</td>
                <td>n (%)</td>
                <td>Topic no.</td>
                <td>n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="7"><bold>Topics on both social media</bold></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Hair loss</td>
                <td>1</td>
                <td>978 (5.80)</td>
                <td>4</td>
                <td>4065 (5.80)</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Work life during cancer and financial aspects</td>
                <td>2</td>
                <td>590 (3.50)</td>
                <td>8</td>
                <td>3084 (4.40)</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Chemotherapy and its secondary effects</td>
                <td>3</td>
                <td>1147 (6.80)</td>
                <td>2</td>
                <td>3925 (5.60)</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Breast reconstruction</td>
                <td>5</td>
                <td>1315 (7.80)</td>
                <td>9</td>
                <td>3645 (5.20)</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Support from patient’s family and friends</td>
                <td>6</td>
                <td>759 (4.50)</td>
                <td>10</td>
                <td>3505 (5.00)</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td><break/></td>
                <td><break/></td>
                <td>19</td>
                <td>3432 (4.90)</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Family members with breast cancer</td>
                <td>10</td>
                <td>692 (4.10)</td>
                <td>17</td>
                <td>3575 (5.10)</td>
                <td>No</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Treatment period</td>
                <td>11</td>
                <td>675 (4.00)</td>
                <td>20</td>
                <td>3505 (5.00)</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Healing</td>
                <td>13</td>
                <td>1113 (6.60)</td>
                <td>18</td>
                <td>5607 (8.00)</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Diagnosis</td>
                <td>16</td>
                <td>1063 (6.30)</td>
                <td>1</td>
                <td>3294 (4.70)</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Breast cancer as a daily battle</td>
                <td>17</td>
                <td>1248 (7.40)</td>
                <td>3</td>
                <td>3785 (5.40)</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Body care and body image during cancer, and sexuality</td>
                <td>18</td>
                <td>540 (3.20)</td>
                <td>6</td>
                <td>3715 (5.30)</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td><break/></td>
                <td><break/></td>
                <td>15</td>
                <td>2804 (4.00)</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td><break/></td>
                <td><break/></td>
                <td>16</td>
                <td>2734 (3.90)</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td colspan="7"><bold>Topics on only 1 social media</bold></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Hormone therapy and its secondary effects</td>
                <td>4</td>
                <td>860 (5.10)</td>
                <td>N/A<sup>a</sup></td>
                <td>N/A</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Radiotherapy and its secondary effects</td>
                <td>7</td>
                <td>810 (4.80)</td>
                <td>N/A</td>
                <td>N/A</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Complementary and alternative medicine</td>
                <td>8</td>
                <td>523 (3.10)</td>
                <td>N/A</td>
                <td>N/A</td>
                <td>No</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Media and forum information exchange</td>
                <td>9</td>
                <td>877 (5.20)</td>
                <td>N/A</td>
                <td>N/A</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Everyday life during cancer</td>
                <td>12</td>
                <td>523 (3.10)</td>
                <td>N/A</td>
                <td>N/A</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Search for medical information</td>
                <td>14</td>
                <td>692 (4.10)</td>
                <td>N/A</td>
                <td>N/A</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Mourning</td>
                <td>15</td>
                <td>843 (5.00)</td>
                <td>N/A</td>
                <td>N/A</td>
                <td>No</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Surgery</td>
                <td>19</td>
                <td>1198 (7.10)</td>
                <td>N/A</td>
                <td>N/A</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Waiting for results of analysis, concerns</td>
                <td>20</td>
                <td>422 (2.50)</td>
                <td>N/A</td>
                <td>N/A</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Secondary effects of treatments</td>
                <td>N/A</td>
                <td>N/A</td>
                <td>5</td>
                <td>2804 (4.00)</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Family background and breast cancer</td>
                <td>N/A</td>
                <td>N/A</td>
                <td>7</td>
                <td>3014 (4.30)</td>
                <td>No</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Interaction with nurses and doctors</td>
                <td>N/A</td>
                <td>N/A</td>
                <td>11</td>
                <td>2804 (4.00)</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Anxiety and fatigue</td>
                <td>N/A</td>
                <td>N/A</td>
                <td>12</td>
                <td>2734 (3.90)</td>
                <td>Yes</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Healing of family member</td>
                <td>N/A</td>
                <td>N/A</td>
                <td>13</td>
                <td>5047 (7.20)</td>
                <td>No</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Relapse</td>
                <td>N/A</td>
                <td>N/A</td>
                <td>14</td>
                <td>3014 (4.30)</td>
                <td>Yes</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table7fn1">
              <p><sup>a</sup>N/A: not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this work, we used an unsupervised learning model known as LDA to detect the different topics on a health forum and social network discussed by patients. We demonstrated how we used the LDA model on patient data with relevant preprocessing applied to 2 datasets obtained from a forum and Facebook messages. We used MeSH as the principal resource for medical terms and for patients’ and doctors’ vocabulary [<xref ref-type="bibr" rid="ref45">45</xref>]. We automatically detected relationships between topics and questions. We found good relationships between detected topics and the dimensions of internationally standardized questionnaires used for breast cancer patients, which substantiate the sound construction of such questionnaires. We detected new emerging topics from social media that could be used to complement current QoL questionnaires. Moreover, we confirmed that social media can be an important source of information for the study of QoL in the field of cancer.</p>
        <p>In our ongoing work [<xref ref-type="bibr" rid="ref21">21</xref>], we are targeting the classification of whole messages or text snippets with respect to the role of the narrator (patient, confidant of a patient, expert, health professional) and to the location within the trajectory of care (before or after an operation, first cancer or relapse). One potential limitation of this work was the number of topics (<italic>K</italic>=20) selected for our LDA model. This limitation may be overcome by using the number of topics for which the model is better adjusted [<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref61">61</xref>,<xref ref-type="bibr" rid="ref62">62</xref>], and then, first, merging topics that are close and, second, finding topics that could not be interpreted by humans and eliminating them. Moreover, the current comparison of the 2 corpora (Facebook and forum) was done manually by the expert. One possibility is to adapt equation 1 (<xref ref-type="fig" rid="figure3">Figure 3</xref>) used to align LDA topics and questionnaire items in order to automatically compare topics extracted from the 2 corpora.</p>
        <p>Of course, the lack of informed consent given by social media users for data usage leads to ethical questions. In particular, confidentiality with respect to the publication of research results is an issue (see others’ discussion and guidelines [<xref ref-type="bibr" rid="ref66">66</xref>-<xref ref-type="bibr" rid="ref68">68</xref>]). We adhered to those guidelines. We have presented results with a degree of detail that does not permit conclusions on individual users to be drawn. In the long term, we will study emotions described by patients in their messages for each topic and make some statistical analyses. Finally, we will use the emotion classification system built by Abdaoui et al [<xref ref-type="bibr" rid="ref69">69</xref>] to detect polarity (positive, negative, or neutral), subjectivity (objective, subjective), and feelings (joy, surprise, anger, fear, etc) of users’ messages, and we will relate this information to the detected topics in order to determine patients’ perception of their disease. What are the topics that frighten patients the most and that need prevention?</p>
      </sec>
    </sec>
  </body>
  <back>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">EORTC</term>
          <def>
            <p>European Organization for Research and Treatment of Cancer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">INCA</term>
          <def>
            <p>Institut National du Cancer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">LDA</term>
          <def>
            <p>latent Dirichlet allocation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">MeSH</term>
          <def>
            <p>Medical Subject Headings</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">QLQ-C30</term>
          <def>
            <p>Quality of Life Questionnaire Core 30</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">QoL</term>
          <def>
            <p>quality of life</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The 5 authors are justifiably credited with authorship, according to the authorship criteria. In particular: MDTN: acquisition of data, analysis and interpretation of data, drafting of the methods, final approval; TO: conception, analysis and interpretation of data, drafting of the manuscript, final approval; CL: conception, analysis, drafting of the manuscript, final approval; SB: acquisition of data, critical revision of manuscript, final approval; CM: interpretation of data, critical revision of manuscript, final approval. This work was supported by the ANR SIFR (Semantic Indexing of French Biomedical Data Resources) and by a grant from the French Public Health Research Institute (http://www.iresp.net) under the 2012 call for projects as part of the 2009-2013 Cancer Plan.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Robinson</surname>
            <given-names>KM</given-names>
          </name>
        </person-group>
        <article-title>Unsolicited narratives from the Internet: a rich source of qualitative data</article-title>
        <source>Qual Health Res</source>  
        <year>2001</year>  
        <month>09</month>  
        <volume>11</volume>  
        <issue>5</issue>  
        <fpage>706</fpage>  
        <lpage>714</lpage>  
        <pub-id pub-id-type="doi">10.1177/104973201129119398</pub-id>
        <pub-id pub-id-type="medline">11554197</pub-id></nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Seale</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Charteris-Black</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>MacFarlane</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>McPherson</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>Interviews and internet forums: a comparison of two sources of qualitative data</article-title>
        <source>Qual Health Res</source>  
        <year>2010</year>  
        <month>05</month>  
        <volume>20</volume>  
        <issue>5</issue>  
        <fpage>595</fpage>  
        <lpage>606</lpage>  
        <pub-id pub-id-type="doi">10.1177/1049732309354094</pub-id>
        <pub-id pub-id-type="medline">20008955</pub-id>
        <pub-id pub-id-type="pii">1049732309354094</pub-id></nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hartzler</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Pratt</surname>
            <given-names>W</given-names>
          </name>
        </person-group>
        <article-title>Managing the personal side of health: how patient expertise differs from the expertise of clinicians</article-title>
        <source>J Med Internet Res</source>  
        <year>2011</year>  
        <volume>13</volume>  
        <issue>3</issue>  
        <fpage>e62</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2011/3/e62/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.1728</pub-id>
        <pub-id pub-id-type="medline">21846635</pub-id>
        <pub-id pub-id-type="pii">v13i3e62</pub-id>
        <pub-id pub-id-type="pmcid">PMC3222167</pub-id></nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hancock</surname>
            <given-names>JT</given-names>
          </name>
          <name name-style="western">
            <surname>Toma</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Ellison</surname>
            <given-names>N</given-names>
          </name>
        </person-group>
        <article-title>The truth about lying in online dating profiles</article-title>
        <year>2007</year>  
        <conf-name>The SIGCHI conference on Human factors in computing systems of the ACM</conf-name>
        <conf-date>Apr 28–May 3, 2007</conf-date>
        <conf-loc>San Jose, CA, USA</conf-loc>
        <fpage>449</fpage>  
        <lpage>452</lpage> </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Lemire</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Paré</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Sicotte</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Harvey</surname>
            <given-names>C</given-names>
          </name>
        </person-group>
        <article-title>Determinants of Internet use as a preferred source of information on personal health</article-title>
        <source>Int J Med Inform</source>  
        <year>2008</year>  
        <month>11</month>  
        <volume>77</volume>  
        <issue>11</issue>  
        <fpage>723</fpage>  
        <lpage>34</lpage>  
        <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2008.03.002</pub-id>
        <pub-id pub-id-type="medline">18434246</pub-id>
        <pub-id pub-id-type="pii">S1386-5056(08)00039-7</pub-id></nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ybarra</surname>
            <given-names>ML</given-names>
          </name>
          <name name-style="western">
            <surname>Suman</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Help seeking behavior and the Internet: a national survey</article-title>
        <source>Int J Med Inform</source>  
        <year>2006</year>  
        <month>01</month>  
        <volume>75</volume>  
        <issue>1</issue>  
        <fpage>29</fpage>  
        <lpage>41</lpage>  
        <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2005.07.029</pub-id>
        <pub-id pub-id-type="medline">16129659</pub-id>
        <pub-id pub-id-type="pii">S1386-5056(05)00147-4</pub-id></nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Rice</surname>
            <given-names>RE</given-names>
          </name>
        </person-group>
        <article-title>Influences, usage, and outcomes of Internet health information searching: multivariate results from the Pew surveys</article-title>
        <source>Int J Med Inform</source>  
        <year>2006</year>  
        <month>01</month>  
        <volume>75</volume>  
        <issue>1</issue>  
        <fpage>8</fpage>  
        <lpage>28</lpage>  
        <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2005.07.032</pub-id>
        <pub-id pub-id-type="medline">16125453</pub-id>
        <pub-id pub-id-type="pii">S1386-5056(05)00146-2</pub-id></nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ganz</surname>
            <given-names>PA</given-names>
          </name>
          <name name-style="western">
            <surname>Rowland</surname>
            <given-names>JH</given-names>
          </name>
          <name name-style="western">
            <surname>Desmond</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Meyerowitz</surname>
            <given-names>BE</given-names>
          </name>
          <name name-style="western">
            <surname>Wyatt</surname>
            <given-names>GE</given-names>
          </name>
        </person-group>
        <article-title>Life after breast cancer: understanding women's health-related quality of life and sexual functioning</article-title>
        <source>J Clin Oncol</source>  
        <year>1998</year>  
        <month>02</month>  
        <volume>16</volume>  
        <issue>2</issue>  
        <fpage>501</fpage>  
        <lpage>14</lpage>  
        <pub-id pub-id-type="doi">10.1200/JCO.1998.16.2.501</pub-id>
        <pub-id pub-id-type="medline">9469334</pub-id></nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>King</surname>
            <given-names>MT</given-names>
          </name>
          <name name-style="western">
            <surname>Kenny</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Shiell</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Hall</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Boyages</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Quality of life three months and one year after first treatment for early stage breast cancer: influence of treatment and patient characteristics</article-title>
        <source>Qual Life Res</source>  
        <year>2000</year>  
        <volume>9</volume>  
        <issue>7</issue>  
        <fpage>789</fpage>  
        <lpage>800</lpage>  
        <pub-id pub-id-type="medline">11297021</pub-id></nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Lidgren</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Wilking</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Jönsson</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Rehnberg</surname>
            <given-names>C</given-names>
          </name>
        </person-group>
        <article-title>Health related quality of life in different states of breast cancer</article-title>
        <source>Qual Life Res</source>  
        <year>2007</year>  
        <month>08</month>  
        <volume>16</volume>  
        <issue>6</issue>  
        <fpage>1073</fpage>  
        <lpage>1081</lpage>  
        <pub-id pub-id-type="doi">10.1007/s11136-007-9202-8</pub-id>
        <pub-id pub-id-type="medline">17468943</pub-id></nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Montazeri</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>Health-related quality of life in breast cancer patients: a bibliographic review of the literature from 1974 to 2007</article-title>
        <source>J Exp Clin Cancer Res</source>  
        <year>2008</year>  
        <volume>27</volume>  
        <issue>1</issue>  
        <fpage>32</fpage>  
        <pub-id pub-id-type="doi">10.1186/1756-9966-27-32</pub-id>
        <pub-id pub-id-type="medline">18759983</pub-id></nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Doward</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>McKenna</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <article-title>Defining patient-reported outcomes</article-title>
        <source>Value Health</source>  
        <year>2004</year>  
        <volume>7 Suppl 1</volume>  
        <fpage>S4</fpage>  
        <lpage>8</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1098-3015(10)60228-8"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1111/j.1524-4733.2004.7s102.x</pub-id>
        <pub-id pub-id-type="medline">15367236</pub-id>
        <pub-id pub-id-type="pii">S1098-3015(10)60228-8</pub-id></nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Fayers</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Machin</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <source>Quality of Life: The Assessment, Analysis and Interpretation of Patient-Reported Outcomes. 2nd edition</source>  
        <year>2013</year>  
        <publisher-loc>New York, NY</publisher-loc>
        <publisher-name>John Wiley &#38; Sons</publisher-name></nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Bausewein</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Hartenstein</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>Oncology and palliative care</article-title>
        <source>Oncol Res Treat</source>  
        <year>2001</year>  
        <month>1</month>  
        <day>15</day>  
        <volume>23</volume>  
        <issue>6</issue>  
        <fpage>534</fpage>  
        <lpage>537</lpage>  
        <pub-id pub-id-type="doi">10.1159/000055002</pub-id>
        <pub-id pub-id-type="medline">11441257</pub-id></nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Bausewein</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Higginson</surname>
            <given-names>IJ</given-names>
          </name>
        </person-group>
        <article-title>Appropriate methods to assess the effectiveness and efficacy of treatments or interventions to control cancer pain</article-title>
        <source>J Palliat Med</source>  
        <year>2004</year>  
        <month>06</month>  
        <volume>7</volume>  
        <issue>3</issue>  
        <fpage>423</fpage>  
        <lpage>430</lpage>  
        <pub-id pub-id-type="doi">10.1089/1096621041349572</pub-id>
        <pub-id pub-id-type="medline">15265352</pub-id></nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hirth</surname>
            <given-names>RA</given-names>
          </name>
          <name name-style="western">
            <surname>Chernew</surname>
            <given-names>ME</given-names>
          </name>
          <name name-style="western">
            <surname>Miller</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Fendrick</surname>
            <given-names>AM</given-names>
          </name>
          <name name-style="western">
            <surname>Weissert</surname>
            <given-names>WG</given-names>
          </name>
        </person-group>
        <article-title>Willingness to pay for a quality-adjusted life year: in search of a standard</article-title>
        <source>Med Decis Making</source>  
        <year>2000</year>  
        <volume>20</volume>  
        <issue>3</issue>  
        <fpage>332</fpage>  
        <lpage>342</lpage>  
        <pub-id pub-id-type="doi">10.1177/0272989X0002000310</pub-id>
        <pub-id pub-id-type="medline">10929856</pub-id></nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Cutler</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>McClellan</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Is technological change in medicine worth it?</article-title>
        <source>Health Aff (Millwood)</source>  
        <year>2001</year>  
        <volume>20</volume>  
        <issue>5</issue>  
        <fpage>11</fpage>  
        <lpage>29</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://content.healthaffairs.org/cgi/pmidlookup?view=long&#38;pmid=11558696"/>
        </comment>  
        <pub-id pub-id-type="medline">11558696</pub-id></nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hillner</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Smith</surname>
            <given-names>TJ</given-names>
          </name>
        </person-group>
        <article-title>Efficacy and cost effectiveness of adjuvant chemotherapy in women with node-negative breast cancer. A decision-analysis model</article-title>
        <source>N Engl J Med</source>  
        <year>1991</year>  
        <month>01</month>  
        <day>17</day>  
        <volume>324</volume>  
        <issue>3</issue>  
        <fpage>160</fpage>  
        <lpage>168</lpage>  
        <pub-id pub-id-type="doi">10.1056/NEJM199101173240305</pub-id>
        <pub-id pub-id-type="medline">1898533</pub-id></nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Garratt</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Schmidt</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Mackintosh</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Fitzpatrick</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>Quality of life measurement: bibliographic study of patient assessed health outcome measures</article-title>
        <source>BMJ</source>  
        <year>2002</year>  
        <month>06</month>  
        <day>15</day>  
        <volume>324</volume>  
        <issue>7351</issue>  
        <fpage>1417</fpage>  
        <lpage>1421</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/12065262"/>
        </comment>  
        <pub-id pub-id-type="medline">12065262</pub-id>
        <pub-id pub-id-type="pmcid">PMC115850</pub-id></nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Aaronson</surname>
            <given-names>NK</given-names>
          </name>
          <name name-style="western">
            <surname>Ahmedzai</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Bergman</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Bullinger</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Cull</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Duez</surname>
            <given-names>NJ</given-names>
          </name>
          <name name-style="western">
            <surname>Filiberti</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Flechtner</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Fleishman</surname>
            <given-names>SB</given-names>
          </name>
          <name name-style="western">
            <surname>de Haes</surname>
            <given-names>JC</given-names>
          </name>
        </person-group>
        <article-title>The European Organization for Research and Treatment of Cancer QLQ-C30: a quality-of-life instrument for use in international clinical trials in oncology</article-title>
        <source>J Natl Cancer Inst</source>  
        <year>1993</year>  
        <month>03</month>  
        <day>03</day>  
        <volume>85</volume>  
        <issue>5</issue>  
        <fpage>365</fpage>  
        <lpage>376</lpage>  
        <pub-id pub-id-type="medline">8433390</pub-id></nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Opitz</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Bringay</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Azé</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Joutard</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Lavergne</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Mollevi</surname>
            <given-names>C</given-names>
          </name>
        </person-group>
        <article-title>Breast cancer and quality of life: medical information extraction from health forums</article-title>
        <year>2014</year>  
        <conf-name>Medical Informatics Europe</conf-name>
        <conf-date>Aug 31–Sept 3, 2014</conf-date>
        <conf-loc>Istanbul, Turkey</conf-loc>
        <fpage>1070</fpage>  
        <lpage>1074</lpage> </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Arnold</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Speier</surname>
            <given-names>W</given-names>
          </name>
        </person-group>
        <article-title>A topic model of clinical reports</article-title>
        <year>2012</year>  
        <conf-name>The 35th international ACM SIGIR conference on Research and development in information retrieval</conf-name>
        <conf-date>Aug 12-16, 2012</conf-date>
        <conf-loc>Portland, OR, USA</conf-loc>
        <fpage>12</fpage>  
        <lpage>16</lpage> </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Lu</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Li</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Deng</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <article-title>Health-related hot topic detection in online communities using text clustering</article-title>
        <source>PLoS One</source>  
        <year>2013</year>  
        <month>01</month>  
        <volume>8</volume>  
        <issue>2</issue>  
        <fpage>e56221</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0056221"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pone.0056221</pub-id>
        <pub-id pub-id-type="medline">23457530</pub-id>
        <pub-id pub-id-type="pii">PONE-D-12-27785</pub-id>
        <pub-id pub-id-type="pmcid">PMC3574139</pub-id></nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Grave</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Sklar</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Elhadad</surname>
            <given-names>N</given-names>
          </name>
        </person-group>
        <source>Longitudinal analysis of discussion topics in an online breast cancer community using convolutional neural networks</source>  
        <year>2016</year>  
        <month>03</month>  
        <access-date>2017-07-18</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://adsabs.harvard.edu/abs/2016arXiv160308458Z">http://adsabs.harvard.edu/abs/2016arXiv160308458Z</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6s3SA3w5B"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Landauer</surname>
            <given-names>TK</given-names>
          </name>
          <name name-style="western">
            <surname>Dumais</surname>
            <given-names>ST</given-names>
          </name>
        </person-group>
        <article-title>A solution to Plato's problem: The latent semantic analysis theory of acquisition, induction, and representation of knowledge</article-title>
        <source>Psychol Rev</source>  
        <year>1997</year>  
        <volume>104</volume>  
        <issue>2</issue>  
        <fpage>211</fpage>  
        <lpage>240</lpage>  
        <pub-id pub-id-type="doi">10.1037/0033-295X.104.2.211</pub-id></nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hofmann</surname>
            <given-names>T</given-names>
          </name>
        </person-group>
        <article-title>Unsupervised learning by probabilistic latent semantic analysis</article-title>
        <source>Mach Learn</source>  
        <year>2001</year>  
        <volume>42</volume>  
        <issue>1</issue>  
        <fpage>177</fpage>  
        <lpage>196</lpage>  
        <pub-id pub-id-type="doi">10.1023/A:1007617005950</pub-id></nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Blei</surname>
            <given-names>DM</given-names>
          </name>
          <name name-style="western">
            <surname>Ng</surname>
            <given-names>AY</given-names>
          </name>
          <name name-style="western">
            <surname>Jordan</surname>
            <given-names>MI</given-names>
          </name>
        </person-group>
        <article-title>Latent Dirichlet allocation</article-title>
        <source>J Mach Learn Res</source>  
        <year>2003</year>  
        <volume>3</volume>  
        <fpage>993</fpage>  
        <lpage>1022</lpage>  
        <pub-id pub-id-type="doi">10.1162/jmlr.2003.3.4-5.993</pub-id></nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Deerwester</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Dumais</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Furnas</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Landauer</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Harshman</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>Indexing by latent semantic analysis</article-title>
        <source>J Am Soc Inf Sci</source>  
        <year>1990</year>  
        <month>09</month>  
        <volume>41</volume>  
        <issue>6</issue>  
        <fpage>391</fpage>  
        <lpage>407</lpage>  
        <pub-id pub-id-type="doi">10.1002/(SICI)1097-4571(199009)41:6&#60;391::AID-ASI1&#62;3.0.CO;2-9</pub-id></nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zhan</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Q</given-names>
          </name>
          <name name-style="western">
            <surname>Leischow</surname>
            <given-names>SJ</given-names>
          </name>
          <name name-style="western">
            <surname>Zeng</surname>
            <given-names>DD</given-names>
          </name>
        </person-group>
        <article-title>Identifying topics for e-cigarette user-generated contents: a case study from multiple social media platforms</article-title>
        <source>J Med Internet Res</source>  
        <year>2017</year>  
        <month>01</month>  
        <day>20</day>  
        <volume>19</volume>  
        <issue>1</issue>  
        <fpage>e24</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2017/1/e24/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.5780</pub-id>
        <pub-id pub-id-type="medline">28108428</pub-id>
        <pub-id pub-id-type="pii">v19i1e24</pub-id>
        <pub-id pub-id-type="pmcid">PMC5291865</pub-id></nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Ding</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Tang</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Dong</surname>
            <given-names>X</given-names>
          </name>
          <name name-style="western">
            <surname>He</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Qiu</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Wild</surname>
            <given-names>DJ</given-names>
          </name>
        </person-group>
        <article-title>Finding complex biological relationships in recent PubMed articles using Bio-LDA</article-title>
        <source>PLoS One</source>  
        <year>2011</year>  
        <month>03</month>  
        <day>23</day>  
        <volume>6</volume>  
        <issue>3</issue>  
        <fpage>e17243</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0017243"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pone.0017243</pub-id>
        <pub-id pub-id-type="medline">21448266</pub-id>
        <pub-id pub-id-type="pmcid">PMC3063155</pub-id></nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Paul</surname>
            <given-names>MJ</given-names>
          </name>
          <name name-style="western">
            <surname>Dredze</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Discovering health topics in social media using topic models</article-title>
        <source>PLoS One</source>  
        <year>2014</year>  
        <volume>9</volume>  
        <issue>8</issue>  
        <fpage>e103408</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0103408"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pone.0103408</pub-id>
        <pub-id pub-id-type="medline">25084530</pub-id>
        <pub-id pub-id-type="pii">PONE-D-14-00554</pub-id>
        <pub-id pub-id-type="pmcid">PMC4118877</pub-id></nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Portier</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Greer</surname>
            <given-names>GE</given-names>
          </name>
          <name name-style="western">
            <surname>Rokach</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Ofek</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Biyani</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Yu</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Banerjee</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Zhao</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Mitra</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Yen</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Understanding topics and sentiment in an online cancer survivor community</article-title>
        <source>J Natl Cancer Inst Monogr</source>  
        <year>2013</year>  
        <month>12</month>  
        <volume>2013</volume>  
        <issue>47</issue>  
        <fpage>195</fpage>  
        <lpage>198</lpage>  
        <pub-id pub-id-type="doi">10.1093/jncimonographs/lgt025</pub-id>
        <pub-id pub-id-type="medline">24395991</pub-id>
        <pub-id pub-id-type="pii">lgt025</pub-id></nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Attard</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Coulson</surname>
            <given-names>NS</given-names>
          </name>
        </person-group>
        <article-title>A thematic analysis of patient communication in Parkinson’s disease online support group discussion forums</article-title>
        <source>Comput Hum Behav</source>  
        <year>2012</year>  
        <month>03</month>  
        <volume>28</volume>  
        <issue>2</issue>  
        <fpage>500</fpage>  
        <lpage>506</lpage>  
        <pub-id pub-id-type="doi">10.1016/j.chb.2011.10.022</pub-id></nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Selby</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>van Mierlo</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Voci</surname>
            <given-names>SC</given-names>
          </name>
          <name name-style="western">
            <surname>Parent</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Cunningham</surname>
            <given-names>JA</given-names>
          </name>
        </person-group>
        <article-title>Online social and professional support for smokers trying to quit: an exploration of first time posts from 2562 members</article-title>
        <source>J Med Internet Res</source>  
        <year>2010</year>  
        <volume>12</volume>  
        <issue>3</issue>  
        <fpage>e34</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2010/3/e34/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.1340</pub-id>
        <pub-id pub-id-type="medline">20719739</pub-id>
        <pub-id pub-id-type="pii">v12i3e34</pub-id>
        <pub-id pub-id-type="pmcid">PMC2956324</pub-id></nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Himmel</surname>
            <given-names>W</given-names>
          </name>
          <name name-style="western">
            <surname>Reincke</surname>
            <given-names>U</given-names>
          </name>
          <name name-style="western">
            <surname>Michelmann</surname>
            <given-names>HW</given-names>
          </name>
        </person-group>
        <article-title>Text mining and natural language processing approaches for automatic categorization of lay requests to web-based expert forums</article-title>
        <source>J Med Internet Res</source>  
        <year>2009</year>  
        <month>07</month>  
        <day>22</day>  
        <volume>11</volume>  
        <issue>3</issue>  
        <fpage>e25</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2009/3/e25/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.1123</pub-id>
        <pub-id pub-id-type="medline">19632978</pub-id>
        <pub-id pub-id-type="pii">v11i3e25</pub-id>
        <pub-id pub-id-type="pmcid">PMC2762848</pub-id></nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Huh</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Yetisgen-Yildiz</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Pratt</surname>
            <given-names>W</given-names>
          </name>
        </person-group>
        <article-title>Text classification for assisting moderators in online health communities</article-title>
        <source>J Biomed Inform</source>  
        <year>2013</year>  
        <month>12</month>  
        <volume>46</volume>  
        <issue>6</issue>  
        <fpage>998</fpage>  
        <lpage>1005</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.sciencedirect.com/science/article/pii/S1532046413001391"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1016/j.jbi.2013.08.011</pub-id>
        <pub-id pub-id-type="medline">24025513</pub-id>
        <pub-id pub-id-type="pii">S1532-0464(13)00139-1</pub-id>
        <pub-id pub-id-type="pmcid">PMC3874858</pub-id></nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hao</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>K</given-names>
          </name>
        </person-group>
        <article-title>The voice of Chinese health consumers: a text mining approach to web-based physician reviews</article-title>
        <source>J Med Internet Res</source>  
        <year>2016</year>  
        <month>05</month>  
        <day>10</day>  
        <volume>18</volume>  
        <issue>5</issue>  
        <fpage>e108</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2016/5/e108/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.4430</pub-id>
        <pub-id pub-id-type="medline">27165558</pub-id>
        <pub-id pub-id-type="pii">v18i5e108</pub-id>
        <pub-id pub-id-type="pmcid">PMC4879326</pub-id></nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hao</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>W</given-names>
          </name>
          <name name-style="western">
            <surname>Gao</surname>
            <given-names>G</given-names>
          </name>
        </person-group>
        <article-title>A tale of two countries: international comparison of online doctor reviews between China and the United States</article-title>
        <source>Int J Med Inform</source>  
        <year>2017</year>  
        <month>03</month>  
        <volume>99</volume>  
        <fpage>37</fpage>  
        <lpage>44</lpage>  
        <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2016.12.007</pub-id>
        <pub-id pub-id-type="medline">28118920</pub-id>
        <pub-id pub-id-type="pii">S1386-5056(16)30275-1</pub-id></nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Yesha</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Gangopadhyay</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>A method for analyzing health behavior in online forums</article-title>
        <year>2015</year>  
        <conf-name>The 6th ACM Conference on Bioinformatics, Computational Biology and Health Informatics</conf-name>
        <conf-date>Sept 9-12, 2015</conf-date>
        <conf-loc>Atlanta, GA, USA</conf-loc>
        <fpage>615</fpage>  
        <lpage>621</lpage> </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>YC</given-names>
          </name>
          <name name-style="western">
            <surname>Kraut</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Levine</surname>
            <given-names>JM</given-names>
          </name>
        </person-group>
        <article-title>To stay or leave?: the relationship of emotional and informational support to commitment in online health support groups</article-title>
        <year>2012</year>  
        <conf-name>Conference on Computer Supported Cooperative Work</conf-name>
        <conf-date>Feb 11-15, 2012</conf-date>
        <conf-loc>Seattle, WA, USA</conf-loc>
        <fpage>833</fpage>  
        <lpage>842</lpage> </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Paul</surname>
            <given-names>MJ</given-names>
          </name>
          <name name-style="western">
            <surname>Dredze</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Exploring health topics in Chinese social media: an analysis of Sina Weibo</article-title>
        <year>2014</year>  
        <conf-name>The AAAI Workshop on the World Wide Web and Public Health Intelligence</conf-name>
        <conf-date>July 27-28, 2014</conf-date>
        <conf-loc>Quebec, QC, Canada</conf-loc>
        <fpage>20</fpage>  
        <lpage>23</lpage> </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Balahur</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>Sentiment analysis in social media texts</article-title>
        <year>2013</year>  
        <conf-name>The 4th workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis</conf-name>
        <conf-date>June 14, 2013</conf-date>
        <conf-loc>Atlanta, GA, USA</conf-loc>
        <fpage>120</fpage>  
        <lpage>128</lpage> </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Farzindar</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Inkpen</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <article-title>Natural language processing for social media</article-title>
        <source>Synthesis Lectures on Human Language Technologies</source>  
        <year>2015</year>  
        <month>08</month>  
        <day>28</day>  
        <volume>8</volume>  
        <issue>2</issue>  
        <fpage>1</fpage>  
        <lpage>166</lpage>  
        <pub-id pub-id-type="doi">10.2200/S00659ED1V01Y201508HLT030</pub-id></nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Schmid</surname>
            <given-names>H</given-names>
          </name>
        </person-group>
        <article-title>Probabilistic part-of-speech tagging using decision trees</article-title>
        <year>1994</year>  
        <conf-name>International Conference on New Methods in Language Processing</conf-name>
        <conf-date>Sept 14-16, 1994</conf-date>
        <conf-loc>Manchester, UK</conf-loc>
        <fpage>14</fpage>  
        <lpage>16</lpage> </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Tapi Nzali</surname>
            <given-names>MD</given-names>
          </name>
          <name name-style="western">
            <surname>Bringay</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Lavergne</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Opitz</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Azé</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Mollevi</surname>
            <given-names>C</given-names>
          </name>
        </person-group>
        <article-title>Construction d’un vocabulaire patient/médecin dédié au cancer du sein à partir des médias sociaux</article-title>
        <year>2015</year>  
        <conf-name>The 26th Conference on Ingénierie des Connaissances</conf-name>
        <conf-date>July 1-3, 2015</conf-date>
        <conf-loc>Rennes, France</conf-loc>
        <fpage>9</fpage>  
        <lpage>20</lpage> </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Asuncion</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Welling</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Smyth</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Teh</surname>
            <given-names>YW</given-names>
          </name>
        </person-group>
        <article-title>On smoothing and inference for topic models</article-title>
        <year>2009</year>  
        <conf-name>The 25th Conference on Uncertainty in Artificial Intelligence</conf-name>
        <conf-date>June 18-21, 2009</conf-date>
        <conf-loc>Montreal, QC, Canada</conf-loc>
        <fpage>27</fpage>  
        <lpage>34</lpage> </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Griffiths</surname>
            <given-names>TL</given-names>
          </name>
          <name name-style="western">
            <surname>Steyvers</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Finding scientific topics</article-title>
        <source>Proc Natl Acad Sci U S A</source>  
        <year>2004</year>  
        <month>04</month>  
        <day>06</day>  
        <volume>101 Suppl 1</volume>  
        <fpage>5228</fpage>  
        <lpage>35</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.pnas.org/cgi/pmidlookup?view=long&#38;pmid=14872004"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1073/pnas.0307752101</pub-id>
        <pub-id pub-id-type="medline">14872004</pub-id>
        <pub-id pub-id-type="pii">0307752101</pub-id>
        <pub-id pub-id-type="pmcid">PMC387300</pub-id></nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Chang</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Gerrish</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Boyd-Graber</surname>
            <given-names>JL</given-names>
          </name>
          <name name-style="western">
            <surname>Blei</surname>
            <given-names>DM</given-names>
          </name>
        </person-group>
        <article-title>Reading tea leaves: how humans interpret topic models</article-title>
        <year>2009</year>  
        <conf-name>The 22nd International Conference on Neural Information Processing Systems</conf-name>
        <conf-date>Dec 7-10, 2009</conf-date>
        <conf-loc>Vancouver, BC, Canada</conf-loc>
        <fpage>288</fpage>  
        <lpage>296</lpage> </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wallach</surname>
            <given-names>HM</given-names>
          </name>
          <name name-style="western">
            <surname>Murray</surname>
            <given-names>I</given-names>
          </name>
          <name name-style="western">
            <surname>Salakhutdinov</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Mimno</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <article-title>Evaluation methods for topic models</article-title>
        <year>2009</year>  
        <conf-name>Annual International Conference on Machine Learning ACM</conf-name>
        <conf-date>June 14-18, 2009</conf-date>
        <conf-loc>Montreal, QC, Canada</conf-loc>
        <fpage>1105</fpage>  
        <lpage>1112</lpage> </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Thirion</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Pereira</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Névéol</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Dahamna</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Darmoni</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <article-title>French MeSH Browser: a cross-language tool to access MEDLINE/PubMed</article-title>
        <year>2006</year>  
        <conf-name>Annual Symposium Proceedings/AMIA Symposium</conf-name>
        <conf-date>Nov 11-15, 2006</conf-date>
        <conf-loc>Washington, DC, USA</conf-loc>
        <fpage>1132</fpage> </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Jaccard</surname>
            <given-names>P</given-names>
          </name>
        </person-group>
        <source>Etude Comparative de la Distribution Florale dans une Portion des Alpes et du Jura</source>  
        <year>1901</year>  
        <publisher-loc>Zurich, Switzerland</publisher-loc>
        <publisher-name>Imprimerie Corbaz</publisher-name></nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Grün</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Hornik</surname>
            <given-names>K</given-names>
          </name>
        </person-group>
        <article-title>topicmodels: an R package for fitting topic models</article-title>
        <source>J Stat Softw</source>  
        <year>2011</year>  
        <volume>40</volume>  
        <issue>13</issue>  
        <pub-id pub-id-type="doi">10.18637/jss.v040.i13</pub-id></nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Anota</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Barbieri</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Savina</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Pam</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Gourgou-Bourgade</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Bonnetain</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Bascoul-Mollevi</surname>
            <given-names>C</given-names>
          </name>
        </person-group>
        <article-title>Comparison of three longitudinal analysis models for the health-related quality of life in oncology: a simulation study</article-title>
        <source>Health Qual Life Outcomes</source>  
        <year>2014</year>  
        <month>12</month>  
        <day>31</day>  
        <volume>12</volume>  
        <fpage>192</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="https://hqlo.biomedcentral.com/articles/10.1186/s12955-014-0192-2"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1186/s12955-014-0192-2</pub-id>
        <pub-id pub-id-type="medline">25551580</pub-id>
        <pub-id pub-id-type="pii">s12955-014-0192-2</pub-id>
        <pub-id pub-id-type="pmcid">PMC4326524</pub-id></nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Barbieri</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Anota</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Conroy</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Gourgou-Bourgade</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Juzyna</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Bonnetain</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Lavergne</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Bascoul-Mollevi</surname>
            <given-names>C</given-names>
          </name>
        </person-group>
        <article-title>Applying the longitudinal model from item response theory to assess health-related quality of life in the PRODIGE 4/ACCORD 11 randomized trial</article-title>
        <source>Med Decis Making</source>  
        <year>2016</year>  
        <month>07</month>  
        <volume>36</volume>  
        <issue>5</issue>  
        <fpage>615</fpage>  
        <lpage>28</lpage>  
        <pub-id pub-id-type="doi">10.1177/0272989X15621883</pub-id>
        <pub-id pub-id-type="medline">26683246</pub-id>
        <pub-id pub-id-type="pii">0272989X15621883</pub-id></nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Sprangers</surname>
            <given-names>MA</given-names>
          </name>
          <name name-style="western">
            <surname>Groenvold</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Arraras</surname>
            <given-names>JI</given-names>
          </name>
          <name name-style="western">
            <surname>Franklin</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>te Velde</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Muller</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Franzini</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Williams</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>de Haes</surname>
            <given-names>HC</given-names>
          </name>
          <name name-style="western">
            <surname>Hopwood</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Cull</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Aaronson</surname>
            <given-names>NK</given-names>
          </name>
        </person-group>
        <article-title>The European Organization for Research and Treatment of Cancer breast cancer-specific quality-of-life questionnaire module: first results from a three-country field study</article-title>
        <source>J Clin Oncol</source>  
        <year>1996</year>  
        <month>10</month>  
        <volume>14</volume>  
        <issue>10</issue>  
        <fpage>2756</fpage>  
        <lpage>68</lpage>  
        <pub-id pub-id-type="doi">10.1200/JCO.1996.14.10.2756</pub-id>
        <pub-id pub-id-type="medline">8874337</pub-id></nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zeng</surname>
            <given-names>QT</given-names>
          </name>
          <name name-style="western">
            <surname>Tse</surname>
            <given-names>T</given-names>
          </name>
        </person-group>
        <article-title>Exploring and developing consumer health vocabularies</article-title>
        <source>J Am Med Inform Assoc</source>  
        <year>2006</year>  
        <volume>13</volume>  
        <issue>1</issue>  
        <fpage>24</fpage>  
        <lpage>9</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://jamia.oxfordjournals.org/lookup/pmidlookup?view=long&#38;pmid=16221948"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1197/jamia.M1761</pub-id>
        <pub-id pub-id-type="medline">16221948</pub-id>
        <pub-id pub-id-type="pii">M1761</pub-id>
        <pub-id pub-id-type="pmcid">PMC1380193</pub-id></nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Abdaoui</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Azé</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Bringay</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Grabar</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Poncelet</surname>
            <given-names>P</given-names>
          </name>
        </person-group>
        <article-title>Analysis of forum posts written by patients and health professionals</article-title>
        <year>2014</year>  
        <conf-name>Medical Informatics Europe</conf-name>
        <conf-date>Aug 31–Sept 3, 2014</conf-date>
        <conf-loc>Istanbul, Turkey</conf-loc>
        <fpage>1185</fpage> </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Taphoorn</surname>
            <given-names>MJB</given-names>
          </name>
          <name name-style="western">
            <surname>Claassens</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Aaronson</surname>
            <given-names>NK</given-names>
          </name>
          <name name-style="western">
            <surname>Coens</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Mauer</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Osoba</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Stupp</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Mirimanoff</surname>
            <given-names>RO</given-names>
          </name>
          <name name-style="western">
            <surname>van den Bent</surname>
            <given-names>MJ</given-names>
          </name>
          <name name-style="western">
            <surname>Bottomley</surname>
            <given-names>A</given-names>
          </name>
          <collab>EORTC Quality of Life Group, Brain Cancer, NCIC and Radiotherapy Groups</collab>
        </person-group>
        <article-title>An international validation study of the EORTC brain cancer module (EORTC QLQ-BN20) for assessing health-related quality of life and symptoms in brain cancer patients</article-title>
        <source>Eur J Cancer</source>  
        <year>2010</year>  
        <month>04</month>  
        <volume>46</volume>  
        <issue>6</issue>  
        <fpage>1033</fpage>  
        <lpage>40</lpage>  
        <pub-id pub-id-type="doi">10.1016/j.ejca.2010.01.012</pub-id>
        <pub-id pub-id-type="medline">20181476</pub-id>
        <pub-id pub-id-type="pii">S0959-8049(10)00031-6</pub-id></nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Bergman</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Aaronson</surname>
            <given-names>NK</given-names>
          </name>
          <name name-style="western">
            <surname>Ahmedzai</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Kaasa</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Sullivan</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>The EORTC QLQ-LC13: a modular supplement to the EORTC Core Quality of Life Questionnaire (QLQ-C30) for use in lung cancer clinical trials. EORTC Study Group on Quality of Life</article-title>
        <source>Eur J Cancer</source>  
        <year>1994</year>  
        <volume>30A</volume>  
        <issue>5</issue>  
        <fpage>635</fpage>  
        <lpage>42</lpage>  
        <pub-id pub-id-type="medline">8080679</pub-id></nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Abboute</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Boudjeriou</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Entringer</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Azé</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Bringay</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Poncelet</surname>
            <given-names>P</given-names>
          </name>
        </person-group>
        <article-title>Mining Twitter for suicide prevention</article-title>
        <year>2014</year>  
        <conf-name>International Conference on Applications of Natural Language to Data Bases/Information Systems</conf-name>
        <conf-date>June 18–20, 2014</conf-date>
        <conf-loc>Montpellier, France</conf-loc>
        <fpage>250</fpage>  
        <lpage>253</lpage> </nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Arun</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Suresh</surname>
            <given-names>V</given-names>
          </name>
          <name name-style="western">
            <surname>Madhavan</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Murthy</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>On finding the natural number of topics with latent Dirichlet allocation: some observations</article-title>
        <year>2010</year>  
        <conf-name>Pacific-Asia Conference on Knowledge Discovery and Data Mining</conf-name>
        <conf-date>June 21–24, 2010</conf-date>
        <conf-loc>Hyderabad, India</conf-loc>
        <fpage>391</fpage>  
        <lpage>402</lpage> </nlm-citation>
      </ref>
      <ref id="ref62">
        <label>62</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Cao</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Xia</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Li</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Tang</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <article-title>A density-based method for adaptive LDA model selection</article-title>
        <source>Neurocomputing</source>  
        <year>2009</year>  
        <month>03</month>  
        <volume>72</volume>  
        <issue>7</issue>  
        <fpage>1775</fpage>  
        <lpage>1781</lpage>  
        <pub-id pub-id-type="doi">10.1016/j.neucom.2008.06.011</pub-id></nlm-citation>
      </ref>
      <ref id="ref63">
        <label>63</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zhao</surname>
            <given-names>W</given-names>
          </name>
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>JJ</given-names>
          </name>
          <name name-style="western">
            <surname>Perkins</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>Z</given-names>
          </name>
          <name name-style="western">
            <surname>Ge</surname>
            <given-names>W</given-names>
          </name>
          <name name-style="western">
            <surname>Ding</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Zou</surname>
            <given-names>W</given-names>
          </name>
        </person-group>
        <article-title>A heuristic approach to determine an appropriate number of topics in topic modeling</article-title>
        <source>BMC Bioinformatics</source>  
        <year>2015</year>  
        <volume>16 Suppl 13</volume>  
        <fpage>S8</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-16-S13-S8"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1186/1471-2105-16-S13-S8</pub-id>
        <pub-id pub-id-type="medline">26424364</pub-id>
        <pub-id pub-id-type="pii">1471-2105-16-S13-S8</pub-id>
        <pub-id pub-id-type="pmcid">PMC4597325</pub-id></nlm-citation>
      </ref>
      <ref id="ref64">
        <label>64</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Blei</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Lafferty</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Correlated topic models</article-title>
        <year>2005</year>  
        <conf-name>The 18th International Conference on Neural Information Processing Systems</conf-name>
        <conf-date>Dec 5-8, 2005</conf-date>
        <conf-loc>Vancouver, BC, Canada</conf-loc>
        <fpage>147</fpage>  
        <lpage>154</lpage> </nlm-citation>
      </ref>
      <ref id="ref65">
        <label>65</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Arnold</surname>
            <given-names>CW</given-names>
          </name>
          <name name-style="western">
            <surname>Oh</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Speier</surname>
            <given-names>W</given-names>
          </name>
        </person-group>
        <article-title>Evaluating topic model interpretability from a primary care physician perspective</article-title>
        <source>Comput Methods Programs Biomed</source>  
        <year>2016</year>  
        <month>02</month>  
        <volume>124</volume>  
        <fpage>67</fpage>  
        <lpage>75</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26614020"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1016/j.cmpb.2015.10.014</pub-id>
        <pub-id pub-id-type="medline">26614020</pub-id>
        <pub-id pub-id-type="pii">S0169-2607(15)00274-6</pub-id>
        <pub-id pub-id-type="pmcid">PMC4724339</pub-id></nlm-citation>
      </ref>
      <ref id="ref66">
        <label>66</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>King</surname>
            <given-names>SA</given-names>
          </name>
        </person-group>
        <article-title>Researching Internet communities: proposed ethical guidelines for the reporting of results</article-title>
        <source>Inf Soc</source>  
        <year>1996</year>  
        <month>06</month>  
        <volume>12</volume>  
        <issue>2</issue>  
        <fpage>119</fpage>  
        <lpage>128</lpage>  
        <pub-id pub-id-type="doi">10.1080/713856145</pub-id></nlm-citation>
      </ref>
      <ref id="ref67">
        <label>67</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Frankel</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Siang</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <source>Ethical and legal aspects of human subjects research on the Internet</source>  
        <year>1999</year>  
        <access-date>2017-07-19</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="https://nationalethicscenter.org/resources/187/download/ethical_legal.pdf">https://nationalethicscenter.org/resources/187/download/ethical_legal.pdf</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6s4oEE0mK"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref68">
        <label>68</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Kraut</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Olson</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Banaji</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Bruckman</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Cohen</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Couper</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Psychological research online: report of Board of Scientific Affairs' Advisory Group on the Conduct of Research on the Internet</article-title>
        <source>Am Psychol</source>  
        <year>2004</year>  
        <volume>59</volume>  
        <issue>2</issue>  
        <fpage>105</fpage>  
        <lpage>117</lpage>  
        <pub-id pub-id-type="doi">10.1037/0003-066X.59.2.105</pub-id>
        <pub-id pub-id-type="medline">14992637</pub-id>
        <pub-id pub-id-type="pii">2004-11287-003</pub-id></nlm-citation>
      </ref>
      <ref id="ref69">
        <label>69</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Abdaoui</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Tapi Nzali</surname>
            <given-names>MD</given-names>
          </name>
          <name name-style="western">
            <surname>Azé</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Bringay</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Lavergne</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Mollevi</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Poncelet</surname>
            <given-names>P</given-names>
          </name>
        </person-group>
        <article-title>Analyse du sentiment, de l'opinion et de l'émotion sur des Tweets Français</article-title>
        <year>2015</year>  
        <conf-name>The 11th Workshop on Défi Fouille de Texte, Traitement Automatique des Langues Naturelles</conf-name>
        <conf-date>June 22, 2015</conf-date>
        <conf-loc>Caen, France</conf-loc>
        <fpage>78</fpage>  
        <lpage>87</lpage> </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
