<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i6e17650</article-id>
      <article-id pub-id-type="pmid">32574151</article-id>
      <article-id pub-id-type="doi">10.2196/17650</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Automatic Construction of a Depression-Domain Lexicon Based on Microblogs: Text Mining Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Hao</surname>
            <given-names>Tianyong</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Tang</surname>
            <given-names>Buzhou</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhang</surname>
            <given-names>Zhichang</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Carvalho</surname>
            <given-names>Darlinton</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Genghao</given-names>
          </name>
          <degrees>MSF</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4612-2321</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Bing</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>School of Information Technology &#38; Management</institution>
            <institution>University of International Business and Economics</institution>
            <addr-line>Chaoyang District, Huixin East Street</addr-line>
            <addr-line>Beijing, 100029</addr-line>
            <country>China</country>
            <phone>86 1 343 978 8086</phone>
            <email>01630@uibe.edu.cn</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3919-8275</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Huang</surname>
            <given-names>Langlin</given-names>
          </name>
          <degrees>BM</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9631-0334</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Hou</surname>
            <given-names>Sibing</given-names>
          </name>
          <degrees>MAFN</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4827-644X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>School of Information Technology &#38; Management</institution>
        <institution>University of International Business and Economics</institution>
        <addr-line>Beijing</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Graduate School of Art and Science</institution>
        <institution>Columbia University</institution>
        <addr-line>New York, NY</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Bing Li <email>01630@uibe.edu.cn</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>6</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>23</day>
        <month>6</month>
        <year>2020</year>
      </pub-date>
      <volume>8</volume>
      <issue>6</issue>
      <elocation-id>e17650</elocation-id>
      <history>
        <date date-type="received">
          <day>31</day>
          <month>12</month>
          <year>2019</year>
        </date>
        <date date-type="rev-request">
          <day>20</day>
          <month>2</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>11</day>
          <month>4</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>5</day>
          <month>5</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Genghao Li, Bing Li, Langlin Huang, Sibing Hou. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 23.06.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://medinform.jmir.org/2020/6/e17650/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>According to a World Health Organization report in 2017, there was almost one patient with depression among every 20 people in China. However, the diagnosis of depression is usually difficult in terms of clinical detection owing to slow observation, high cost, and patient resistance. Meanwhile, with the rapid emergence of social networking sites, people tend to share their daily life and disclose inner feelings online frequently, making it possible to effectively identify mental conditions using the rich text information. There are many achievements regarding an English web-based corpus, but for research in China so far, the extraction of language features from web-related depression signals is still in a relatively primary stage.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>The purpose of this study was to propose an effective approach for constructing a depression-domain lexicon. This lexicon will contain language features that could help identify social media users who potentially have depression. Our study also compared the performance of detection with and without our lexicon.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We autoconstructed a depression-domain lexicon using Word2Vec, a semantic relationship graph, and the label propagation algorithm. These two methods (Word2Vec and label propagation) combined performed well in a specific corpus during construction. The lexicon was obtained based on 111,052 Weibo microblogs from 1868 users who were depressed or nondepressed. During depression detection, we considered six features, and we used five classification methods to test the detection performance.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The experiment results showed that in terms of the F1 value, our autoconstruction method performed 1% to 6% better than baseline approaches and was more effective and steadier. When applied to detection models like logistic regression and support vector machine, our lexicon helped the models outperform by 2% to 9% and was able to improve the final accuracy of potential depression detection.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Our depression-domain lexicon was proven to be a meaningful input for classification algorithms, providing linguistic insights on the depressive status of test subjects. We believe that this lexicon will enhance early depression detection in people on social media. Future work will need to be carried out on a larger corpus and with more complex methods.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>depression detection</kwd>
        <kwd>depression diagnosis</kwd>
        <kwd>social media</kwd>
        <kwd>automatic construction</kwd>
        <kwd>domain-specific lexicon</kwd>
        <kwd>depression lexicon</kwd>
        <kwd>label propagation</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Depression, one of the major reasons for suicide in recent years, is a severe mental disorder characterized by persisting low mood states in the affected person. It is expected to be the largest contributor to disease burden worldwide by 2030, especially in China with a high-pressure lifestyle. According to a World Health Organization (WHO) report in 2017 [<xref ref-type="bibr" rid="ref1">1</xref>], China had more than 54 million people with depression, which means that there was almost one patient with depression among every 20 people. In addition, a national estimation based on China’s 2012 census data shows that with an adult population size of 1.04 billion, an estimated 258.41 million adults (24.79%) are at increased risk of depressive symptoms [<xref ref-type="bibr" rid="ref2">2</xref>]. It has been reported that the suicide rate among patients with depression is more than 20 times that of the general population, and patients with depression account for more than half of those who have committed suicide [<xref ref-type="bibr" rid="ref3">3</xref>].</p>
        <p>Diagnosis of potential depression in an early stage can provide more opportunities for those affected to receive appropriate treatment and overcome the disease. However, owing to the lack of mental health knowledge, the lack of regular counseling, and the fact that mental health diseases are greatly different from physical diseases as there is no pain, many patients with depression do not recognize it. Although some know a little about depression, they are often reluctant to seek professional help because of a sense of shame [<xref ref-type="bibr" rid="ref4">4</xref>].</p>
        <p>The traditional clinical diagnosis of depression mainly relies on standardized assessments, which are highly accurate but have limitations in detection efficiency [<xref ref-type="bibr" rid="ref5">5</xref>]. The medical diagnosis requires not only filling in a depression assessment scale, such as the Self-rating Depression Scale, but also a one-to-one interview and long-term observation [<xref ref-type="bibr" rid="ref6">6</xref>], which involve high costs. Patients tend to remain undetected until the disease presents obvious symptoms, which also means that the optimal treatment period has passed [<xref ref-type="bibr" rid="ref7">7</xref>]. The whole diagnosis process is highly passive, as doctors have to wait for patients to knock on their door.</p>
        <p>Things are changing with the development of social media. Nowadays, many methods combining machine learning algorithms and text mining techniques have been developed to diagnose potential depression in an early stage [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. Compared with traditional approaches, these methods have been proven to be effective and inexpensive, and have been shown to reduce limitations and assist in clinical diagnosis in a more flexible way. At the same time, people are used to disclosing their inner feelings on social media. The huge corpus provides abundant text describing things like sadness, exhaustion, and breakdown, which have the potential to reflect depression. Hundreds of millions of people in third or fourth tier cities and poor mountainous areas in China have little chance to disclose their mental conditions directly to experts, but they can provide their accounts and apply for social media methods. Experts can then intervene and conduct more targeted treatments for users who are potentially depressed. Another scenario involves teenagers on campus, and teachers can pay more attention to the actual mental status of students who are potentially depressed with the help of forums and other web-based text. It is thus feasible to detect users’ depressive mental states on a large scale on social media, and this provides convenience for expert assessment.</p>
        <p>Actually, when coping with textual depression data, word-based features like frequency and embedding are commonly used and a domain lexicon might be valuable to understand the author of the text [<xref ref-type="bibr" rid="ref14">14</xref>]. Many research studies have achieved a lot in terms of a depression lexicon, which is mainly in English [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. In China, research about web-related depression detection is just getting started, and we did not find any domain lexicon research about depression in a public study. It would not be proper to translate an English lexicon directly owing to cultural differences. Thus, a depression-domain lexicon in Chinese is needed.</p>
        <p>In this paper, based on a well-labeled depression data set on Weibo, which is one of the largest Chinese user-generated content platforms, we constructed a depression-domain lexicon containing more than 2000 words. This lexicon can be used to assist in the early diagnosis of depression. We crawled more than 144,000 microblog tweets of nearly 2000 users within a time span of 16 months to obtain depressed and nondepressed data sets. Some manual screening was implemented to remove “fake” depression microblogs from the data sets, which is clarified in the “Data Preprocessing” subsection. We extracted 80 words as seeds and then built a semantic association graph with the similarities between the seeds and candidate words and utilized the label propagation algorithm (LPA) to automatically mark new words in the graph. The LPA is a good method in such a construction, which has been further explained in the “Related Work” subsection. We then tested the effectiveness of this method and compared it with some baseline approaches. We found that this autoconstruction of a depression-domain lexicon performed the best and had the most stable performance when parameters changed. For further research, this lexicon was used as an input for machine learning algorithms, providing insights into the depressive status of test subjects, so as to improve detection accuracy. According to our research, the detection models with lexicon features outperformed the models without lexicon features by 2% to 9% in terms of evaluation scores.</p>
        <p>The main contributions are as follows: (1) We extracted a set of depressive words and constructed our domain lexicon in Chinese, which is a good contribution to web-related depression signal detection, to assist in identifying users who have the potential to experience depression in an early period. We applied an efficient semisupervised automatic construction method in the depression domain. The lexicon was proven to be meaningful in several detecting classification models in our study; (2) We constructed a benchmark depression data set (some of the data were used to construct the lexicon [our main research objective] and the other data were used in the detection test) based on microblogs, which could assist in further depression detection, diagnosis, and analysis. Meanwhile, we released the data set and lexicon together [<xref ref-type="bibr" rid="ref15">15</xref>] to facilitate future web-related depression diagnosis.</p>
      </sec>
      <sec>
        <title>Related Work About the Traditional Approach for Depression Detection</title>
        <p>For decades, there have been many ways to detect depression. Beck [<xref ref-type="bibr" rid="ref16">16</xref>] created the original Beck Depression Inventory for a quick self-testing measure that can briefly assess recent depression symptoms. Thereafter, Beck et al [<xref ref-type="bibr" rid="ref17">17</xref>] updated the approach to Beck Depression Inventory II that can assess the severity of self-reported depression symptoms by paper or electronic format. Radloff [<xref ref-type="bibr" rid="ref18">18</xref>] developed the Center for Epidemiologic Studies Depression Scale, which focuses more on the individual’s emotional experience and less on the physical condition. Some other popular scales are the Zung Self-rating Depression Scale [<xref ref-type="bibr" rid="ref19">19</xref>] and Hamilton Depression Rating Scale [<xref ref-type="bibr" rid="ref20">20</xref>].</p>
        <p>Since the 21st century, new scales are continuously being improved. Diagnostic and Statistical Manual of Mental Disorders (DSM-IV) is a standard classification of mental disorders used by mental health professionals, which was improved by Hu [<xref ref-type="bibr" rid="ref21">21</xref>]. In China, the Chinese Classification and Diagnosis of Mental Diseases 3rd edition (CCMD-3) is a standard for diagnosis.</p>
        <p>Overall, traditional ways of depression detection have been highly validated and accepted in the real world for decades. However, they mainly rely on the scores of scales or questionnaires, face-to-face interviews, and self-reports, and often require a lot of labor and time [<xref ref-type="bibr" rid="ref6">6</xref>]. The new trend might be related to big data that are timely, rich, and easily accessible on social networking sites like Facebook, Twitter, and Weibo. These web-based methods can assist in large-scale early detection, and experts can further conduct more precise diagnosis and treatment.</p>
      </sec>
      <sec>
        <title>Related Work About Depression Detection on Social Media</title>
        <p>In recent years, with abundant data on social media, some researchers are attempting to detect depression by leveraging web-based data. Park et al [<xref ref-type="bibr" rid="ref8">8</xref>] explored the use of depressive language from Twitter users and concluded that social networks can provide meaningful data for capturing depressive moods. Choudhury et al [<xref ref-type="bibr" rid="ref9">9</xref>] were the first to diagnose and predict depression via social media by extracting several features. Hasan et al [<xref ref-type="bibr" rid="ref10">10</xref>] proposed a new method with the circumplex model to classify Twitter messages as depressed, happy, or other emotions. Resnik et al [<xref ref-type="bibr" rid="ref11">11</xref>] explored the use of supervised topic models in an analysis of linguistic signals for detecting depression. During such research, word-based features are of great importance on social media [<xref ref-type="bibr" rid="ref14">14</xref>].</p>
        <p>Word-based features could be shown in a lexicon. Tsugawa et al [<xref ref-type="bibr" rid="ref12">12</xref>] utilized positive and negative sentiment words in a lexicon for recognizing depression. Choudhury et al [<xref ref-type="bibr" rid="ref9">9</xref>] used semantic orientation pointwise mutual information (SO-PMI) and term frequency-inverse document frequency (TF-IDF) to extract a depression lexicon from “mental health” in Yahoo! Answers and set the Wikipedia page on “list of antidepressants” as antidepressant words. Most recently, Guangyao et al [<xref ref-type="bibr" rid="ref13">13</xref>] employed Word2Vec (W2V) to extract words of antidepressants and depression symptoms from Twitter as a domain-specific lexicon.</p>
        <p>Many previous studies achieved a lot with regard to a depression lexicon, which can greatly help in diagnosis; however, most of the words are in English. It is not proper to use the translated version of an English lexicon to detect depression in a Chinese corpus because of cultural differences. In addition, only PMI (mainly about co-occurrence frequency) and W2V (word embeddings) techniques cannot keep up with today’s semantic developments. We can see the feasibility of detecting depression on social media with a lexicon, and more efforts are needed to construct a better Chinese depression-domain lexicon.</p>
      </sec>
      <sec>
        <title>Related Work About Research on Construction of a Domain Lexicon</title>
        <p>Many methods have been used to efficiently construct a domain lexicon. Das et al [<xref ref-type="bibr" rid="ref22">22</xref>] and Krestel and Siersdorfer [<xref ref-type="bibr" rid="ref23">23</xref>] used SO-PMI as a useful tool for emotion lexicon construction. Yu and Dredze [<xref ref-type="bibr" rid="ref24">24</xref>], Tixier et al [<xref ref-type="bibr" rid="ref25">25</xref>], and Zhengyu [<xref ref-type="bibr" rid="ref26">26</xref>] leveraged and improved the W2V method to construct a domain lexicon. Chao et al [<xref ref-type="bibr" rid="ref27">27</xref>] proposed a semisupervised sentiment orientation classification algorithm based on W2V (SO-W2V) and obtained a lexicon in different areas efficiently. The PMI method focuses on the co-occurrence frequency between words but ignores the context. However, W2V considers context with word embeddings but in a relatively simple way compared with the LPA shown below.</p>
        <p>The LPA, which was first proposed by Zhu and Ghahramaniy [<xref ref-type="bibr" rid="ref28">28</xref>], plays an important role in lexicon autoconstruction with semisupervised methods. Researchers [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>] used the LPA starting with several labeled seed words to expand a lexicon for polarity classification. Tai and Kao [<xref ref-type="bibr" rid="ref31">31</xref>] built a framework to automatically generate a lexicon by combining PMI and the LPA. Hamilton et al [<xref ref-type="bibr" rid="ref32">32</xref>] applied a label propagation framework with domain-specific word embeddings to construct accurate domain-specific lexicons. A new method combining W2V and LPA was adopted by Giulianelli [<xref ref-type="bibr" rid="ref33">33</xref>] and Pu et al [<xref ref-type="bibr" rid="ref34">34</xref>], and it performed much better than previous methods. In this way, the relationships between words and specific domain contexts are considered.</p>
      </sec>
      <sec>
        <title>Data Collection</title>
        <p>In order to build a depression-domain lexicon for further detection via social media, we constructed two data sets of users with depression and without depression based on data from Weibo microblogs, which is very popular in China. Weibo has 462 million monthly active users according to a report in 2018 [<xref ref-type="bibr" rid="ref35">35</xref>], and it is the most popular social media website in China. Equivalent with Twitter, people are getting used to sharing their ideas and moods on Weibo.</p>
        <p>In light of the fact that depression is a long-standing illness, the text of users should not be collected from only one microblog. Thus, our data sets contained all Weibo microblogs within a year published by the same users. In addition, personal profile information like comments, number of follows, and number of followers was also included.</p>
        <sec>
          <title>Depressed Data Set D1</title>
          <p>Based on Weibo microblogs from January 2017 to April 2018, we used the keywords “I’m diagnosed with depression” [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>] to construct a depressed data set <italic>D</italic>1. In this way, we finally identified 965 users with depression and 58,265 microblogs (<xref ref-type="table" rid="table1">Table 1</xref>).</p>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Details of the collected data sets from Weibo microblogs.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="180"/>
              <col width="70"/>
              <col width="90"/>
              <col width="90"/>
              <col width="160"/>
              <col width="120"/>
              <col width="110"/>
              <col width="180"/>
              <thead>
                <tr valign="top">
                  <td>Data set</td>
                  <td>Users</td>
                  <td>Total posts</td>
                  <td>Mean</td>
                  <td>Standard deviation</td>
                  <td>Skewness</td>
                  <td>Kurtosis</td>
                  <td>Time span</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Depressed data set <italic>D</italic>1</td>
                  <td>965</td>
                  <td>58,265</td>
                  <td>60.374</td>
                  <td>31.327</td>
                  <td>−0.451</td>
                  <td>1.788</td>
                  <td>January 2017-April 2018 (16 months)</td>
                </tr>
                <tr valign="top">
                  <td>Nondepressed data set <italic>D</italic>2</td>
                  <td>903</td>
                  <td>52,787</td>
                  <td>63.697</td>
                  <td>30.086</td>
                  <td>−0.615</td>
                  <td>2.066</td>
                  <td>January 2017-April 2018 (16 months)</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
        </sec>
        <sec>
          <title>Nondepressed Data Set D2</title>
          <p>If a user never posted any text with a depression-related word like “depress,” the user was labeled as nondepressed. In this way, we constructed a nondepressed data set <italic>D</italic>2. To match <italic>D</italic>1, we selected a similar number of microblogs (one user without depression can have up to 100 posts) under the same time span. In this way, we identified 903 users without depression and 52,787 microblogs (<xref ref-type="table" rid="table1">Table 1</xref>).</p>
        </sec>
      </sec>
      <sec>
        <title>Data Preprocessing</title>
        <p>Before the experiment, we found that there were some unrelated microblogs, irregular words, and emoji in our data sets. These noisy texts can affect the accuracy of our model, so we adopted the following preprocessing procedures: (1) <italic>Emoji processing</italic>. Emoji and emoticons are common in social media. However, they can cause some unexpected troubles like encoding problems during algorithm running and text analysis, so we removed emoji. We will take them into account separately in further research; (2) <italic>Unrelated</italic> <italic>microblog processing</italic>. In addition to depression-domain microblogs that we focused on mostly, many users posted plenty of daily microblogs, including red packets snatching, game sharing, advertisements, etc. In addition, some “fake” depression microblogs like depression scientific articles and content talking about friends with depression, instead of users, are also useless and can be misleading. By manual screening, we obtained a list of unrelated keywords in daily microblogs and “fake” microblogs, and then, we removed them all with regular expression; (3) <italic>Irregular words preprocessing.</italic> New words keep appearing, and language habits are quite different on the internet. These cause trouble during text analysis. Therefore, we added a general dictionary of internet words.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Construction Overview</title>
        <p>Domain adaptability is always a difficult problem in natural language processing. Therefore, a domain-based lexicon can help us perform analysis in a more accurate and deeper way. For example, “excitement,” “life,” and “forever” are common words in our daily life, but they can be abnormal signals of a patient with depression. Thus, through our study, we will try our best to determine which words used on the internet indicate depression and which do not indicate depression.</p>
        <p>There are many ways to construct a domain-based lexicon according to a survey [<xref ref-type="bibr" rid="ref38">38</xref>], which can mainly be divided into knowledge base, corpus base, and these two combined. Knowledge base includes traditional methods, such as word relation extension [<xref ref-type="bibr" rid="ref39">39</xref>] and annotation extension [<xref ref-type="bibr" rid="ref40">40</xref>]. Corpus base refers to conjunction syntax [<xref ref-type="bibr" rid="ref41">41</xref>] and word co-occurrence [<xref ref-type="bibr" rid="ref42">42</xref>]. In fact, a survey [<xref ref-type="bibr" rid="ref38">38</xref>] showed that the class of methods adopted more widely is the automatic method combining existing knowledge and corpus base. In this view, approaches involving semisupervised construction on relationship graphs like the LPA, bootstrapping, and word embedding are popular and effective methods mentioned in the subsection Related Work About Research on Construction of a Domain Lexicon.</p>
        <p>Inspired by Hamilton et al [<xref ref-type="bibr" rid="ref32">32</xref>], Giulianelli [<xref ref-type="bibr" rid="ref33">33</xref>], and Pu et al [<xref ref-type="bibr" rid="ref34">34</xref>], we applied automatic construction, a semisupervised method that combines W2V and LPA on a lexical semantic relationship graph, to obtain a depression-domain lexicon containing depressed feature words. Our construction can be divided into the following four steps: (1) <italic>Extraction of seed words</italic>. Extract a few words that are most important and valuable in the domain; (2) <italic>Extending new words</italic>. Use the W2V model to learn word vectors on the corpus and then extend with similarity between seeds and new words; (3) <italic>Setting labels for the new words</italic>. If the cosine similarity of a word and a seed is greater than the threshold, an edge will be formed, and the weight of the edge is the similarity. Through such iteration, a graph is obtained. After that, the LPA is used on the semantic graph to obtain the labels of all candidate words; (4) <italic>Obtaining the domain-based lexicon</italic>. Finally, by simple manual arrangement, the depression-domain lexicon is formed. We then used it as an input for detection models and found that the models performed much better than before. The method is described as a detailed framework in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>An illustration of the framework. DT: decision tree; LR: logistic regression; NB: naive Bayes; RF: random forest; SVM: support vector machine; TF-IDF: term frequency-inverse document frequency.</p>
          </caption>
          <graphic xlink:href="medinform_v8i6e17650_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Extraction of Seed Words</title>
        <p>Seed words are those that can be representative of a specific domain. In order to extract the key seed words in the depressed and nondepressed data sets, we leveraged the TF-IDF algorithm, which is a widely used feature extraction algorithm in natural language processing. Salton and Yu [<xref ref-type="bibr" rid="ref43">43</xref>] first proposed the TF-IDF algorithm, and Salton et al [<xref ref-type="bibr" rid="ref44">44</xref>] demonstrated its validity in information retrieval. Term frequency (TF) refers to the number of times a term or word occurs in a document, and inverse document frequency (IDF) is related to the frequency of a term appearing in all documents, which measures specificity of the term over the entire corpus.</p>
        <p>TF and IDF can be formulated as follows:</p>
        <p>
          <graphic xlink:href="medinform_v8i6e17650_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
        <p>
          <graphic xlink:href="medinform_v8i6e17650_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
        <p>where <italic>n<sub>i,j</sub></italic> is the word <italic>i</italic> in document <italic>j</italic>, <italic>k</italic> is the number of words in <italic>j</italic>, <italic>N</italic> is the number of documents containing word <italic>i</italic>, <italic>D</italic> is the total number of documents, and <italic>DF(i)</italic> is the number of documents in which the word <italic>i</italic> occurs at least once. Additionally, <italic>tfidf</italic> can be formulated as follows:</p>
        <p>
          <graphic xlink:href="medinform_v8i6e17650_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
        <p>Intuitively, this calculation of TF-IDF will show us how important and special a given word is in our depression domain. Words with a higher <italic>tfidf</italic> value tend to have a greater relevance in a document. In our research, we regarded the data sets <italic>D</italic>1 and <italic>D</italic>2 as two corpora and every microblog as a document. We then extracted words with the highest TF-IDF values in our corpora.</p>
      </sec>
      <sec>
        <title>Extending New Words With W2V</title>
        <p>Now that we had the seeds <italic>S</italic>, we could leverage the word embedding model to extend new words. Word embeddings, which help map the vocabulary to vectors, are popular tools for natural language processing. We adopted W2V, an efficient algorithm for learning embeddings using a neural language model, to generate the vectors. W2V is an open source model by Mikolov et al [<xref ref-type="bibr" rid="ref45">45</xref>] at Google, and its main idea is to use deep learning technology on a specific corpus and then to map each word into a multidimensional real vector space, where the distance between words that have a higher semantic similarity is small.</p>
        <p>In this paper, cosine similarity was used to calculate the similarity between words. When a word's similarity with the seed words in the training corpus was greater than the given threshold, we extracted it as a new word and added it as a candidate word to the candidate word set <italic>C</italic>. If S<italic><sub>i</sub></italic> and C<italic><sub>j</sub></italic> represent the vectors of a seed word and candidate word, respectively, the similarity between them can be formulated by <italic>SIM</italic>(<italic>Si</italic>, <italic>Cj</italic>) as follows:</p>
        <p>
          <graphic xlink:href="medinform_v8i6e17650_fig9.png" xlink:type="simple" mimetype="image"/>
        </p>
      </sec>
      <sec>
        <title>Setting Labels With Label Propagation</title>
        <p>The LPA is a common semisupervised approach on a graph [<xref ref-type="bibr" rid="ref28">28</xref>]. It has been applied to many fields, such as community detection [<xref ref-type="bibr" rid="ref46">46</xref>], personal social relation extraction [<xref ref-type="bibr" rid="ref47">47</xref>], and dictionary construction. Using a graph model to construct a lexicon can capture the global relations among all words, overcome the dependence on seeds, and provide a better result in the case of limited labeled data.</p>
        <p>The LPA builds a graph based on the similarity between nodes, which are the words in our study. After the initialization of the graph, the nodes in the graph can be divided into labeled nodes and unknown nodes. The basic idea of LPA is to predict the label of unknown nodes based on information from labeled ones, and labels are propagated mainly by the weight on the edge between the nodes. In the process of label propagation, unknown nodes can update their own labels through the information of adjacent known labels. If the similarity of the adjacent node is large, the influence of its label will be large.</p>
        <p>In our algorithm, the seeds <italic>S</italic> are taken as the labeled nodes and the extended candidate words <italic>C</italic> are taken as the unknown nodes. The semantic graph is constructed as follows: If the seed word <italic>i</italic> is extended by W2V to get a new word <italic>j</italic>, there is an edge between <italic>i</italic> and <italic>j</italic>, and the weight of the edge is the similarity of the two words. Thus, all of the seed words and candidate words will form a semantic graph as shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>.</p>
        <p>Assuming that there are <italic>n</italic> nodes in total, then an <italic>n</italic>-dimensional transition probability matrix can be constructed. Let <italic>SIM</italic>(<italic>w<sub>i</sub></italic>,<italic>w<sub>j</sub></italic>) represent the similarity between <italic>w<sub>i</sub></italic> and <italic>w<sub>j</sub></italic>, which is calculated by cosine similarity. <italic>T</italic>[<italic>i</italic>][<italic>j</italic>] represents the similarity transfer probability from word <italic>i</italic> to word <italic>j</italic>, which is calculated as follows:</p>
        <p>
          <graphic xlink:href="medinform_v8i6e17650_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
        <p>If there are 10 nodes in the graph, in which <italic>i<sub>1</sub></italic> and <italic>i<sub>2</sub></italic> are depression seed words with the label “−1,” <italic>i<sub>3</sub></italic> is a nondepression seed word with the label “+1,” and the labels of the rest of the candidate words are unknown (given an initial value of 0), the initial labels of all nodes can be represented by the vector <italic>V</italic> as follows:</p>
        <p>
          <graphic xlink:href="medinform_v8i6e17650_fig11.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
        <p>The label of each unknown candidate word is obtained by iteratively applying the transition matrix on the initial labels of the words. The calculation method is as follows:</p>
        <p>
          <graphic xlink:href="medinform_v8i6e17650_fig12.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
        <p>where <italic>Label</italic> represents the label probability of node <italic>j</italic> after the iteration, <italic>T</italic>[<italic>i</italic>][<italic>j</italic>] represents the transfer probability in the similarity matrix of node <italic>i</italic> to node <italic>j</italic>, and <italic>V</italic>[<italic>i</italic>] represents the initial <italic>Label</italic> of node <italic>i</italic> before the iteration.</p>
        <p>In each iteration, the labels of the seeds should remain the same. When the labels of all words in the graph no longer change after continuous iteration, the iteration is over. At the end of the iteration process, the final candidate words are those words whose absolute value of label probability is greater than a certain threshold. In this way, we obtained a well-labeled domain lexicon. The previous algorithms can be concluded as the steps in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>A simple structure of a semantic graph. i: seed word; j: candidate word.</p>
          </caption>
          <graphic xlink:href="medinform_v8i6e17650_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <boxed-text id="box1" position="float">
          <title>Algorithms of the procedure.</title>
          <p><bold><italic>Input:</italic></bold> Corpus of data set (Corpus=<italic>D</italic>1∪<italic>D</italic>2), seeds <italic>S</italic>, and the threshold <italic>T<sub>c</sub></italic> for candidate words <italic>C</italic>.</p>
          <p><bold><italic>Output:</italic></bold> One depression-domain lexicon <italic>L</italic> with depressive words <italic>L<sub>d</sub></italic> and nondepressive words <italic>L<sub>n</sub></italic>.</p>
          <p>
            <bold>
              <italic>Procedure:</italic>
            </bold>
          </p>
          <p>1) Initialize the lexicon and candidate words. <italic>C</italic>=∅, <italic>L<sub>d</sub></italic>=∅, <italic>L<sub>n</sub></italic>=∅.</p>
          <p>2) Preprocess the corpus and learn the word embedding with Word2Vec.</p>
          <p>3) For every seed, <italic>S<sub>i</sub></italic>∈<italic>S</italic>:</p>
          <p>For a word <italic>W<sub>j</sub></italic> in <italic>Corpus</italic>, if <italic>SIM</italic>(<italic>S<sub>i</sub></italic>,<italic>W<sub>j</sub></italic>) ≥<italic>T<sub>c</sub></italic>, then <italic>C</italic>=<italic>C</italic>∪<italic>S</italic><sub>i</sub>∪<italic>W</italic><sub>j</sub>. Record the similarity calculated by equation (4).</p>
          <p>4) After obtaining all the extended candidate words <italic>C</italic> and the similarity matrix between words through step 3), the transition probability matrix of similarity in <italic>C</italic> can be constructed according to equation (5). Then, the semantic relationship graph is obtained.</p>
          <p>5) In the whole graph, <italic>Label</italic> of unknown words is calculated according to formula (7) given the initial label <italic>V</italic>.</p>
          <p>6) Reset the labels of the seeds in <italic>Label</italic> to their initial values. Then, let <italic>V</italic>=<italic>Label</italic>.</p>
          <p>7) Repeat steps 5) and 6) until the labels of <italic>C</italic> in the graph do not change anymore.</p>
          <p>8) Obtain the final <italic>Label</italic>. For <italic>C<sub>i</sub></italic>∈<italic>C</italic>, if <italic>Label<sub>Ci</sub></italic> &#60;0 and &#124;<italic>Label<sub>Ci</sub></italic>&#124; &#62;0.5, then <italic>L<sub>d</sub></italic>=<italic>L<sub>d</sub></italic>∪<italic>C<sub>i</sub></italic>; For <italic>C<sub>j</sub></italic>∈<italic>C</italic>, if <italic>Label<sub>Cj</sub></italic> &#62;0 and &#124;<italic>Label<sub>Cj</sub></italic>&#124; &#62;0.5, then <italic>L<sub>n</sub></italic>=<italic>L<sub>n</sub></italic>∪<italic>C<sub>j</sub></italic>.</p>
          <p>9) Combine <italic>L<sub>d</sub></italic> and <italic>L<sub>n</sub></italic> and finally obtain the depression-domain lexicon <italic>L</italic> after manual work.</p>
        </boxed-text>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Experiment Setup</title>
        <p>We employed our data set to construct a depression-domain lexicon. We needed two types of microblogs combining users with depression or those without depression to extract domain seed words and to finish the automatic construction with the LPA. Our original data crawled from Weibo had some noise, especially in <italic>D</italic>1, so manual preprocessing (detailed description in the “Data Preprocessing” subsection) was necessary to clean the data into <italic>D</italic>1 and <italic>D</italic>2.</p>
        <p>After our lexicon was automatically built, we labeled it depressed or nondepressed for further evaluation. Three volunteers, who had carefully read the depressed microblogs and research articles, were invited to perform the lexicon labeling job [<xref ref-type="bibr" rid="ref48">48</xref>]. Thus, every word in the lexicon was labeled three times. If there was a labeling disagreement, voting was adopted to obtain the ground truth.</p>
      </sec>
      <sec>
        <title>Word Segmentation</title>
        <p>Chinese word segmentation has a great influence in lexicon construction, especially when it comes to Weibo microblogs and the depression domain. In order to segment Chinese words properly in Weibo text, we used the following three steps to segment words as accurately as possible: (1) domain dictionary; (2) large word embedding; and (3) incorrect word removal.</p>
        <sec>
          <title>Domain Dictionary</title>
          <p>When coping with mental disease, especially depression, over the internet, some depression-domain words like paroxetine (“帕罗西汀”), which is a common antidepressant, and self-rating scale (“自评量表”), which is a tool for individuals to measure depression, were difficult to recognize. Other words like MLGB (“马勒戈壁”), which means damn it, and Yali (“鸭梨”), which means pressure, were network vocabularies that could be confusing for the computer. Domain-specific segmentation should combine a domain dictionary [<xref ref-type="bibr" rid="ref49">49</xref>]; however, there is no depression dictionary in public resources. To solve the segmentation problem, we downloaded “Dictionary of Psychology” and “Dictionary of Neuropsychiatry” from the CNKI Tool library [<xref ref-type="bibr" rid="ref50">50</xref>] (there is no depression lexicon yet, so we chose the dictionary of psychology and psychiatry; CNKI is one of the largest Chinese knowledge discovery web-based platforms), downloaded “Weibo Dictionary” from BosonNLP [<xref ref-type="bibr" rid="ref51">51</xref>] (a dictionary automatically constructed from millions of annotation data points from microblogs, forums, and other data sources), and used a manually collected antidepressant dictionary [<xref ref-type="bibr" rid="ref26">26</xref>] (words like amitriptyline and paroxetine in our data sets were replaced with antidepressant as a data reduction method) from web-based pharmacies and science articles. The work of Chinese domain word segmentation was inspired by Fang [<xref ref-type="bibr" rid="ref26">26</xref>] and Cheng [<xref ref-type="bibr" rid="ref49">49</xref>]. The final domain dictionary contained 122,594 words after eliminating duplicate words. We then used jieba (built to be the best Python Chinese word segmentation module) [<xref ref-type="bibr" rid="ref52">52</xref>] as our segmentation module, which adopted the unigram model and hidden Markov model.</p>
        </sec>
        <sec>
          <title>Large Word Embedding</title>
          <p>A richer corpus is associated with more precise word embedding. Instead of using our collected data, which were relatively rare, we leveraged the W2V models by Shen et al [<xref ref-type="bibr" rid="ref53">53</xref>], which are trained on 5 million Weibo microblogs and 223 million Chinese Wiki tokens, for word embeddings.</p>
        </sec>
        <sec>
          <title>Incorrect Word Removal</title>
          <p>We planned to remove incorrect words from our lexicon. Actually, after evaluation, we found that the error rate was approximately 2% to 3%. Among 2385 words in our depression-domain lexicon, there were 64 errors.</p>
        </sec>
      </sec>
      <sec>
        <title>Evaluation Metrics</title>
        <p>During our experiments, we constructed the depression-domain lexicon with an automatic method, compared our method with some baseline approaches, and analyzed key parameters like number of seeds and threshold in the model.</p>
        <p>For the evaluation metrics, we employed precision, recall, and F1 measure (F1) in equations (8), (9) and (10), respectively, to evaluate the performance of our model and the baseline approaches. We used area under the curve (AUC) to evaluate the model of unbalanced data. In terms of the number of words in the lexicon, we also compared the numbers under different circumstances. The equations are as follows:</p>
        <p>
          <graphic xlink:href="medinform_v8i6e17650_fig13.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
        <p>
          <graphic xlink:href="medinform_v8i6e17650_fig14.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
        <p>
          <graphic xlink:href="medinform_v8i6e17650_fig15.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </p>
        <p>where <italic>TP</italic> represents true positive, which means depressed words are correctly detected as depressed; <italic>FN</italic> is false negative, which means depressed words are incorrectly determined as nondepressed; and <italic>FP</italic> is false positive, which means that nondepressed words are incorrectly detected as depressed.</p>
        <p><xref rid="figure1" ref-type="fig">Figure 1</xref> provides an entire picture of the experiment.</p>
      </sec>
      <sec>
        <title>Seed Words</title>
        <p>Before construction, we used the TF-IDF to extract the seed words and obtained a list of the top 2000 words. The samples of the TF-IDF of <italic>D</italic>1 are shown in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
        <p>By artificially screening the list, we could obtain some seed words. Moreover, we added a few general sentiment words with high levels to our seed words and finally obtained a set of seed words of 40 depressive seeds and 40 nondepressive seeds. From parameter sensitivity analysis, we noted that 80 seeds in total will lead to a sufficiently large lexicon with high accuracy. The samples of the 80 seeds are shown in <xref ref-type="table" rid="table3">Table 3</xref>.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>TF-IDF values of depressed D1 samples.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="551"/>
            <col width="449"/>
            <thead>
              <tr valign="top">
                <td>Depressed <italic>D</italic>1</td>
                <td>TF-IDF<sup>a</sup> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Myself (自己)</td>
                <td>0.041383</td>
              </tr>
              <tr valign="top">
                <td>Really (真的)</td>
                <td>0.032475</td>
              </tr>
              <tr valign="top">
                <td>Depression (抑郁症)</td>
                <td>0.024328</td>
              </tr>
              <tr valign="top">
                <td>Hope (希望)</td>
                <td>0.013336</td>
              </tr>
              <tr valign="top">
                <td>Life (生活)</td>
                <td>0.012043</td>
              </tr>
              <tr valign="top">
                <td>Forever (永远)</td>
                <td>0.006965</td>
              </tr>
              <tr valign="top">
                <td>Pain (痛苦)</td>
                <td>0.006871</td>
              </tr>
              <tr valign="top">
                <td>Sad (难过)</td>
                <td>0.006756</td>
              </tr>
              <tr valign="top">
                <td>Live (活着)</td>
                <td>0.006583</td>
              </tr>
              <tr valign="top">
                <td>Mood (心情)</td>
                <td>0.006386</td>
              </tr>
              <tr valign="top">
                <td>Night (晚上)</td>
                <td>0.006347</td>
              </tr>
              <tr valign="top">
                <td>Always (总是)</td>
                <td>0.005984</td>
              </tr>
              <tr valign="top">
                <td>Hate (讨厌)</td>
                <td>0.005475</td>
              </tr>
              <tr valign="top">
                <td>Exhausted (好累)</td>
                <td>0.005469</td>
              </tr>
              <tr valign="top">
                <td>Fear (害怕)</td>
                <td>0.005030</td>
              </tr>
              <tr valign="top">
                <td>Lonely (孤独)</td>
                <td>0.004413</td>
              </tr>
              <tr valign="top">
                <td>Idiot (傻逼)</td>
                <td>0.004380</td>
              </tr>
              <tr valign="top">
                <td>Emotion (感情)</td>
                <td>0.004031</td>
              </tr>
              <tr valign="top">
                <td>Insomnia (失眠)</td>
                <td>0.003950</td>
              </tr>
              <tr valign="top">
                <td>Sorry (对不起)</td>
                <td>0.003867</td>
              </tr>
              <tr valign="top">
                <td>Despair (绝望)</td>
                <td>0.003410</td>
              </tr>
              <tr valign="top">
                <td>Antidepressant (抗抑郁药)</td>
                <td>0.002305</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>TF-IDF: term frequency-inverse document frequency.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Summary of the seeds.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="203"/>
            <col width="797"/>
            <thead>
              <tr valign="top">
                <td>Category</td>
                <td>Seeds <italic>S</italic></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Nondepressive (40 words)</td>
                <td>Stability, comfort, happy, happiness, successful, confidence, sunshine, struggle, positive, brave, enjoy, peace, enthusiasm, healthy, satisfied, active, grow up, pride, good, admire, strong, perfect, praise, precious, progress, congratulate, love, welcome, kindness, robust, earnest, agree, support, award, advantage, good deal, develop, warm, bright colored, and understand<break/>(稳定, 舒服, 高兴, 幸福, 顺利, 自信, 阳光, 奋斗, 积极, 勇敢, 享受, 平安, 热情, 健康, 满意, 活力, 成长, 骄傲, 优秀, 敬佩, 完美, 称赞, 强大, 珍贵, 进步, 庆贺, 关爱, 欢迎, 强壮, 善良, 认真, 同意, 支持, 奖励, 优势, 划算, 发展, 温暖, 鲜艳, 明白)</td>
              </tr>
              <tr valign="top">
                <td>Depressive (40 words)</td>
                <td>Depression, collapse, stress, suicide, apastia, anxious, sad, tired, death, lonely, insomnia, bad, desperate, give up, low, leave, fear, danger, close, sensitive, lost, shadow, destroy, suspect, crash, dark, helpless, guilt, negative, frustration, nervous, melancholy, rubbish, jump, forget, goodbye, cut wrist, edge, haze, and antidepressant<break/>(抑郁, 崩溃, 压力, 自杀, 绝食, 焦躁, 伤心, 疲惫, 死亡, 孤独, 失眠, 难受, 绝望, 放弃, 卑微, 离开, 恐惧, 危险, 封闭, 敏感, 茫然, 阴影, 摧毁, 怀疑, 崩塌, 黑暗, 无助, 愧疚, 负面, 沮丧, 紧张, 忧郁, 废物, 跳楼, 遗忘, 再见, 割腕, 边缘, 阴霾, 抗抑郁药)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Model Evaluation</title>
        <p>In order to verify the effectiveness of the lexicon autoconstruction method applied in this paper, we selected the following methods as baseline approaches: (1) <italic>W2V</italic> [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref26">26</xref>]. A common method of constructing a lexicon based on W2V, which is used to learn word embedding vectors on a corpus. The semantic similarity between words and seed words in the corpus is then iteratively calculated. If the similarity is greater than a certain threshold, the new word is extended and has the same label as the seed word; (2) <italic>SO-W2V</italic> [<xref ref-type="bibr" rid="ref27">27</xref>]. It is a semisupervised sentiment orientation classification algorithm based on a word vector. The basic idea is that through comparison with all positive and negative seed words, an accurate orientation of the extended word will be obtained. It has versatility in different areas for a Chinese corpus; (3) <italic>SO-PMI</italic> [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. It calculates the probability of the occurrence of both seed words and expanded words in the text. A higher probability is associated with a closer correlation; (4) <italic>W2V-LPA</italic>. It is our method, which considers both the word relationship and specific domain context.</p>
        <p>To obtain a fair comparison, we set the same parameters for all methods where <italic>T<sub>c</sub></italic> was 0.5 and the size of seeds <italic>S</italic> was 80. For W2V tools, we used the gensim package [<xref ref-type="bibr" rid="ref54">54</xref>].</p>
        <p>From <xref ref-type="table" rid="table4">Table 4</xref> and <xref rid="figure3" ref-type="fig">Figure 3</xref>, we can see the evaluation results. It is obvious that the W2V-LPA and W2V methods performed much better than the SO-W2V and SO-PMI methods. Moreover, when the size of seeds increased from 60 to 120, our method was able to maintain a more stable and precise performance, which was almost 1% to 6% higher than others (<xref rid="figure3" ref-type="fig">Figure 3</xref>), whereas the value for SO-W2V declined quickly when the size of seeds became larger. Overall, SO-W2V takes all the other seeds into account, but too many seeds combined will introduce too much noise to some extent, as not all seeds are related to an extended word. W2V is a simple and general method, which only considers the label of the first seed when extending new words. Additionally, SO-PMI mainly takes word co-occurrence frequency into account. What W2V-LPA did better is that it only predicted labels through the semantic graph of related and similar words, and thus, the semantic context and word relation were both considered. Therefore, we can say that W2V-LPA is a much better and more stable method for the autoconstruction of a domain lexicon.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Performance of lexicon construction methods.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="290"/>
            <col width="210"/>
            <col width="120"/>
            <col width="120"/>
            <col width="260"/>
            <thead>
              <tr valign="top">
                <td>Construction method</td>
                <td>Precision</td>
                <td>Recall</td>
                <td>F1</td>
                <td>Size of the lexicon</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>W2V-LPA<sup>a</sup></td>
                <td>0.880</td>
                <td>0.906</td>
                <td>0.893</td>
                <td>2321</td>
              </tr>
              <tr valign="top">
                <td>W2V<sup>b</sup></td>
                <td>0.878</td>
                <td>0.903</td>
                <td>0.890</td>
                <td>2321</td>
              </tr>
              <tr valign="top">
                <td>SO-PMI<sup>c</sup></td>
                <td>0.879</td>
                <td>0.877</td>
                <td>0.877</td>
                <td>2024</td>
              </tr>
              <tr valign="top">
                <td>SO-W2V<sup>d</sup></td>
                <td>0.854</td>
                <td>0.877</td>
                <td>0.862</td>
                <td>2321</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>W2V-LPA: label propagation algorithm-Word2Vec.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>W2V: Word2Vec.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>SO-PMI: semantic orientation pointwise mutual information.</p>
            </fn>
            <fn id="table4fn4">
              <p><sup>d</sup>SO-W2V: semantic orientation Word2Vec.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>F1 of methods when the seed size changed. LPA: label propagation algorithm; SO-PMI: semantic orientation pointwise mutual information; SO-W2V: semantic orientation from Word2Vec; W2V: Word2Vec.</p>
          </caption>
          <graphic xlink:href="medinform_v8i6e17650_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Parameter Sensitivity Analysis</title>
        <p>Throughout our experiment, the size of seeds <italic>S</italic> and the extension threshold <italic>T<sub>c</sub></italic> were two important parameters. More seeds or a lower threshold will lead to a lexicon with more words but lower accuracy, whereas fewer seeds and a high threshold will ensure more precision but a poor lexicon. We balanced the trade-offs, as we wanted to obtain a relatively accurate and abundant lexicon that would be helpful for further depression diagnosis. <xref rid="figure4" ref-type="fig">Figure 4</xref> presents the size of the lexicon when the size of seeds and threshold for candidate words changed.</p>
        <p>First, we fixed <italic>T<sub>c</sub></italic> at 0.7 and then varied the size of seeds from 60 to 120. If we have fewer than 60 seeds, the entire lexicon will be so small that almost nothing will remain but seed words. A size larger than 60 will not change the outcome, so 0.7 might be a very high-level threshold. From <xref ref-type="table" rid="table5">Table 5</xref>, we can see that larger sizes of seeds like 100 and 120 partially jeopardized the performance, and W2V-LPA performed nearly the same when the sizes were 60 and 80.</p>
        <p>We then fixed the size of seeds at 80 with varying <italic>T<sub>c</sub></italic> from 0.7 to 0.5. With a higher threshold, the performance was relatively excellent, whereas the size of the lexicon started to fall to around 1000 when <italic>T<sub>c</sub></italic> was 0.55. We believe a lexicon with 2000 words and a <italic>T<sub>c</sub></italic> of 0.5 might have good balance.</p>
        <p>Overall, it is pleasing that our W2V-LPA method performed quite smoothly and steadily even when the parameters were changed, so we believe that a high-quality lexicon can be constructed. It is difficult to find an optimal solution, and given <italic>D</italic>1 and <italic>D</italic>2, we will adopt a size of seeds of 80 and a threshold <italic>T<sub>c</sub></italic> of 0.5 as a relatively proper approach.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Size of the lexicon when the size of seeds and threshold for candidate words changed.</p>
          </caption>
          <graphic xlink:href="medinform_v8i6e17650_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Performance of the W2V-LPA method when <italic>S</italic> and <italic>T<sub>c</sub></italic> were changed.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>
                  <italic>S</italic>
                  <sup>a</sup>
                </td>
                <td>
                  <italic>T<sub>c</sub></italic>
                  <sup>b</sup>
                </td>
                <td>Precision</td>
                <td>Recall</td>
                <td>F1</td>
                <td>Size of the lexicon</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>60</td>
                <td>0.5</td>
                <td>0.882</td>
                <td>0.911</td>
                <td>0.896</td>
                <td>1694</td>
              </tr>
              <tr valign="top">
                <td>60</td>
                <td>0.55</td>
                <td>0.910</td>
                <td>0.935</td>
                <td>0.922</td>
                <td>788</td>
              </tr>
              <tr valign="top">
                <td>60</td>
                <td>0.6</td>
                <td>0.926</td>
                <td>0.944</td>
                <td>0.935</td>
                <td>446</td>
              </tr>
              <tr valign="top">
                <td>60</td>
                <td>0.65</td>
                <td>0.951</td>
                <td>0.963</td>
                <td>0.954</td>
                <td>275</td>
              </tr>
              <tr valign="top">
                <td>60</td>
                <td>0.7</td>
                <td>0.804</td>
                <td>0.897</td>
                <td>0.848</td>
                <td>89</td>
              </tr>
              <tr valign="top">
                <td>80</td>
                <td>0.5</td>
                <td>0.880</td>
                <td>0.906</td>
                <td>0.893</td>
                <td>2321</td>
              </tr>
              <tr valign="top">
                <td>80</td>
                <td>0.55</td>
                <td>0.916</td>
                <td>0.937</td>
                <td>0.926</td>
                <td>1072</td>
              </tr>
              <tr valign="top">
                <td>80</td>
                <td>0.6</td>
                <td>0.934</td>
                <td>0.948</td>
                <td>0.941</td>
                <td>558</td>
              </tr>
              <tr valign="top">
                <td>80</td>
                <td>0.65</td>
                <td>0.954</td>
                <td>0.963</td>
                <td>0.958</td>
                <td>320</td>
              </tr>
              <tr valign="top">
                <td>80</td>
                <td>0.7</td>
                <td>0.918</td>
                <td>0.909</td>
                <td>0.892</td>
                <td>113</td>
              </tr>
              <tr valign="top">
                <td>100</td>
                <td>0.5</td>
                <td>0.874</td>
                <td>0.899</td>
                <td>0.886</td>
                <td>3070</td>
              </tr>
              <tr valign="top">
                <td>100</td>
                <td>0.55</td>
                <td>0.906</td>
                <td>0.924</td>
                <td>0.915</td>
                <td>1589</td>
              </tr>
              <tr valign="top">
                <td>100</td>
                <td>0.6</td>
                <td>0.927</td>
                <td>0.937</td>
                <td>0.931</td>
                <td>792</td>
              </tr>
              <tr valign="top">
                <td>100</td>
                <td>0.65</td>
                <td>0.953</td>
                <td>0.959</td>
                <td>0.955</td>
                <td>418</td>
              </tr>
              <tr valign="top">
                <td>100</td>
                <td>0.7</td>
                <td>0.937</td>
                <td>0.932</td>
                <td>0.925</td>
                <td>144</td>
              </tr>
              <tr valign="top">
                <td>120</td>
                <td>0.5</td>
                <td>0.855</td>
                <td>0.879</td>
                <td>0.866</td>
                <td>3696</td>
              </tr>
              <tr valign="top">
                <td>120</td>
                <td>0.55</td>
                <td>0.889</td>
                <td>0.904</td>
                <td>0.896</td>
                <td>1942</td>
              </tr>
              <tr valign="top">
                <td>120</td>
                <td>0.6</td>
                <td>0.924</td>
                <td>0.933</td>
                <td>0.928</td>
                <td>894</td>
              </tr>
              <tr valign="top">
                <td>120</td>
                <td>0.65</td>
                <td>0.952</td>
                <td>0.958</td>
                <td>0.954</td>
                <td>454</td>
              </tr>
              <tr valign="top">
                <td>120</td>
                <td>0.7</td>
                <td>0.944</td>
                <td>0.940</td>
                <td>0.934</td>
                <td>170</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup><italic>S</italic>: size of the seeds.</p>
            </fn>
            <fn id="table5fn2">
              <p><sup>b</sup><italic>T<sub>c</sub></italic>: threshold for candidate words.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Detection Performance</title>
        <p>After construction of the depression-domain lexicon, we could apply it to actual depression detection in a new Weibo microblog data set to find out if our work would help existing detection models perform better. The detection process included data collection, feature selection, and classification methods.</p>
        <sec>
          <title>Data Collection</title>
          <p>In addition to our data set used for lexicon construction, we collected 745 users who were depressed and 10,118 users who were not depressed with their 1-year tweets as a new data set. Data details are shown in <xref ref-type="table" rid="table6">Table 6</xref>.</p>
          <table-wrap position="float" id="table6">
            <label>Table 6</label>
            <caption>
              <p>Details of the data set for depression detection.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="200"/>
              <col width="90"/>
              <col width="130"/>
              <col width="100"/>
              <col width="150"/>
              <col width="130"/>
              <col width="100"/>
              <col width="100"/>
              <thead>
                <tr valign="top">
                  <td>Data set</td>
                  <td>Users</td>
                  <td>Total posts</td>
                  <td>Mean</td>
                  <td>Standard deviation</td>
                  <td>Skewness</td>
                  <td>Kurtosis</td>
                  <td>Time span</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Depressed data set <italic>D</italic>3</td>
                  <td>745</td>
                  <td>179,600</td>
                  <td>240.44</td>
                  <td>486.28</td>
                  <td>6.21</td>
                  <td>56.32</td>
                  <td>January 2018-June 2019 (18 months)</td>
                </tr>
                <tr valign="top">
                  <td>Nondepressed data set <italic>D</italic>4</td>
                  <td>10,118</td>
                  <td>3,150,000</td>
                  <td>310.93</td>
                  <td>327.72</td>
                  <td>3.50</td>
                  <td>48.52</td>
                  <td>January 2018-June 2019<break/>(18 months)</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
        </sec>
        <sec>
          <title>Feature Selection</title>
          <p>Features like topic-level keywords, posting behaviors, number of tweets, first-person words, and linguistic style are meaningful in detecting depression on the internet [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. We also set our depression-domain lexicon as one feature to see whether it would really contribute a lot after inclusion in the detection model. The features were as follows: (1) <italic>Topic-level keywords.</italic> We selected 30 topic-level keywords with the TF-IDF; (2) <italic>Posting behaviors.</italic> For each user, average length of tweets and total posting numbers were collected to represent web-related posting behaviors; (3) <italic>First-person words.</italic> According to linguistic inquiry and word count [<xref ref-type="bibr" rid="ref55">55</xref>], we counted the number of first-person pronouns like I, we, us, etc; (4) <italic>Linguistic style (200 dimensions).</italic> To approximately analyze linguistic style, we calculated the average vectors of every user with Word2Vec [<xref ref-type="bibr" rid="ref56">56</xref>]. Finally, we constructed the depression-domain lexicon by the previously mentioned process.</p>
        </sec>
        <sec>
          <title>Classification Methods</title>
          <p>We chose naive Bayes (NB), decision tree, logistic regression (LR), random forest, and support vector machine [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref37">37</xref>] as classification methods to detect users with depression. From model performance, we obtained a quick picture about the importance of our lexicon. When the depression-domain lexicon is selected as one feature, the method has the tag L. For example, L-NB is a classification algorithm that has the feature of the depression-domain lexicon, whereas NB does not have this feature. After including the depression-domain lexicon in the models, we clearly found that each detection performance improved when compared with before inclusion of the lexicon (<xref ref-type="table" rid="table7">Table 7</xref>). The performance of lexicon methods surpassed that of corresponding methods without the lexicon by 2% to 9%, which justifies the important role of our lexicon in depression detection.</p>
          <p>The model was based on a data set with 50% users who were depressed and 50% users who were not depressed. When we varied the scale of depressed users, the data set became imbalanced and the AUC became a more important measure of performance. <xref rid="figure5" ref-type="fig">Figure 5</xref> illustrates the trend of detecting performance when setting different proportions of users who were depressed in the L-LR method. This method achieved an outstanding performance when the proportion of users with depression was 50%. However, the AUC dropped sharply when the data set was imbalanced.</p>
          <p>In the real world, people with depression make up less than 10% of the population, and we will determine how to properly detect depression with imbalanced data in a further study.</p>
          <table-wrap position="float" id="table7">
            <label>Table 7</label>
            <caption>
              <p>Detection model performance with the depression-domain lexicon.</p>
            </caption>
            <table border="1" rules="groups" cellpadding="5" frame="hsides" width="1000" cellspacing="0">
              <thead>
                <tr valign="top">
                  <td>Detection model</td>
                  <td>Precision</td>
                  <td>Recall</td>
                  <td>F1</td>
                  <td>Accuracy</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>NB<sup>a</sup></td>
                  <td>67%</td>
                  <td>67%</td>
                  <td>67%</td>
                  <td>67%</td>
                </tr>
                <tr valign="top">
                  <td>L<sup>b</sup>-NB</td>
                  <td>74%</td>
                  <td>73%</td>
                  <td>73%</td>
                  <td>73%</td>
                </tr>
                <tr valign="top">
                  <td>LR<sup>c</sup></td>
                  <td>76%</td>
                  <td>76%</td>
                  <td>75%</td>
                  <td>76%</td>
                </tr>
                <tr valign="top">
                  <td>L-LR</td>
                  <td>77%</td>
                  <td>77%</td>
                  <td>77%</td>
                  <td>77%</td>
                </tr>
                <tr valign="top">
                  <td>RF<sup>d</sup></td>
                  <td>68%</td>
                  <td>68%</td>
                  <td>68%</td>
                  <td>68%</td>
                </tr>
                <tr valign="top">
                  <td>L-RF</td>
                  <td>77%</td>
                  <td>77%</td>
                  <td>76%</td>
                  <td>77%</td>
                </tr>
                <tr valign="top">
                  <td>SVM<sup>e</sup></td>
                  <td>65%</td>
                  <td>65%</td>
                  <td>65%</td>
                  <td>65%</td>
                </tr>
                <tr valign="top">
                  <td>L-SVM</td>
                  <td>74%</td>
                  <td>72%</td>
                  <td>72%</td>
                  <td>72%</td>
                </tr>
                <tr valign="top">
                  <td>DT<sup>f</sup></td>
                  <td>67%</td>
                  <td>67%</td>
                  <td>67%</td>
                  <td>67%</td>
                </tr>
                <tr valign="top">
                  <td>L-DT</td>
                  <td>69%</td>
                  <td>69%</td>
                  <td>69%</td>
                  <td>69%</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table7fn1">
                <p><sup>a</sup>NB: naive Bayes.</p>
              </fn>
              <fn id="table7fn2">
                <p><sup>b</sup>L: depression-domain lexicon as a feature.</p>
              </fn>
              <fn id="table7fn3">
                <p><sup>c</sup>LR: logistic regression.</p>
              </fn>
              <fn id="table7fn4">
                <p><sup>d</sup>RF: random forest.</p>
              </fn>
              <fn id="table7fn5">
                <p><sup>e</sup>SVM: support vector machine.</p>
              </fn>
              <fn id="table7fn6">
                <p><sup>f</sup>DT: decision tree.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <fig id="figure5" position="float">
            <label>Figure 5</label>
            <caption>
              <p>Scales of users who were depressed. AUC: area under the curve.</p>
            </caption>
            <graphic xlink:href="medinform_v8i6e17650_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <p>Diagnosis of users with potential depression via social media has attracted increasing attention because it is a more cost-effective and active approach dealing with massive valuable data than traditional diagnosis. In previous studies, most of the achievements about a lexicon involved an English corpus. Instead of translating an English lexicon, this paper aimed to apply an automatic construction method for a Chinese depression-domain lexicon based on the LPA. With Word2Vec and a semantic relationship graph, the LPA was used to predict the label of candidate words in the graph, and finally, our lexicon was constructed. Experiment results showed that our method was superior to baseline construction methods and had good performance and robustness. In addition, when our lexicon was included as an input for the detection models, their performance became more accurate and effective when compared with the models without the depression-domain lexicon.</p>
      <p>In the next step, experiments are expected to be carried out on a larger depression corpus, and more linguistic knowledge like conjunction will be incorporated into our method to enlarge the range of the depression-domain lexicon. Meanwhile, more complex construction methods like deep neural networks and hierarchical topic models will be adopted in further research. We expect that our lexicon will act as a useful feature in depression detection and will be able to provide more insights for depression diagnosis in terms of advanced depression detection among patients.</p>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">DT</term>
          <def>
            <p>decision tree</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">LPA</term>
          <def>
            <p>label propagation algorithm</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">LR</term>
          <def>
            <p>logistic regression</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">NB</term>
          <def>
            <p>naive Bayes</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">RF</term>
          <def>
            <p>random forest</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">SO-PMI</term>
          <def>
            <p>semantic orientation pointwise mutual information</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">SO-W2V</term>
          <def>
            <p>semantic orientation from Word2Vec</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">TF-IDF</term>
          <def>
            <p>term frequency-inverse document frequency</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">W2V</term>
          <def>
            <p>Word2Vec</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>GL led the method application, experiment conduction, and result analysis. LH and SH participated in data extraction, preprocessing, and manuscript revision. BL provided theoretical guidance and revised the paper. This work was supported by the National Social Science Fund Project, China (No. 16BTQ065) “Multi-source intelligence fusion research on emergencies in big data environment” and the Foundation for Disciplinary Development of the School of Information and Technology in the University of International Business and Economics.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <source>World Health Organization</source>
          <access-date>2020-06-05</access-date>
          <comment>World Health Statistics 2017: Monitoring health for the SDGs<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.who.int/gho/publications/world_health_statistics/2017/en/">https://www.who.int/gho/publications/world_health_statistics/2017/en/</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hsieh</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>The prevalence of depression and depressive symptoms among adults in China: Estimation based on a National Household Survey</article-title>
          <source>China Economic Review</source>
          <year>2018</year>
          <month>10</month>
          <volume>51</volume>
          <fpage>271</fpage>
          <lpage>282</lpage>
          <pub-id pub-id-type="doi">10.1016/j.chieco.2016.04.001</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lépine</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Briley</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>The increasing burden of depression</article-title>
          <source>NDT</source>
          <year>2011</year>
          <month>05</month>
          <volume>7</volume>
          <fpage>3</fpage>
          <lpage>7</lpage>
          <pub-id pub-id-type="doi">10.2147/ndt.s19617</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>O'Loughlin</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Neary</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Adkins</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Schueller</surname>
              <given-names>SM</given-names>
            </name>
          </person-group>
          <article-title>Reviewing the data security and privacy policies of mobile apps for depression</article-title>
          <source>Internet Interv</source>
          <year>2019</year>
          <month>03</month>
          <volume>15</volume>
          <fpage>110</fpage>
          <lpage>115</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2214-7829(18)30046-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.invent.2018.12.001</pub-id>
          <pub-id pub-id-type="medline">30792962</pub-id>
          <pub-id pub-id-type="pii">S2214-7829(18)30046-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC6371412</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rahman</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Omar</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Noah</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Mohd Shahrul Nizam Mohd Danuri</surname>
              <given-names>MS</given-names>
            </name>
          </person-group>
          <article-title>A Survey on Mental Health Detection in Online Social Network</article-title>
          <source>International Journal on Advanced Science, Engineering and Information Technology</source>
          <year>2018</year>
          <volume>8</volume>
          <fpage>1431</fpage>
          <lpage>1436</lpage>
          <pub-id pub-id-type="doi">10.18517/ijaseit.8.4-2.6830</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Qiu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Chua</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Detecting Stress Based on Social Interactions in Social Networks</article-title>
          <source>IEEE Trans. Knowl. Data Eng</source>
          <year>2017</year>
          <month>9</month>
          <day>1</day>
          <volume>29</volume>
          <issue>9</issue>
          <fpage>1820</fpage>
          <lpage>1833</lpage>
          <pub-id pub-id-type="doi">10.1109/tkde.2017.2686382</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cepoiu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>McCusker</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cole</surname>
              <given-names>MG</given-names>
            </name>
            <name name-style="western">
              <surname>Sewitch</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Belzile</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Ciampi</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Recognition of depression by non-psychiatric physicians--a systematic literature review and meta-analysis</article-title>
          <source>J Gen Intern Med</source>
          <year>2008</year>
          <month>01</month>
          <volume>23</volume>
          <issue>1</issue>
          <fpage>25</fpage>
          <lpage>36</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/17968628"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11606-007-0428-5</pub-id>
          <pub-id pub-id-type="medline">17968628</pub-id>
          <pub-id pub-id-type="pmcid">PMC2173927</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Park</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Chiyoung</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Meeyoung</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Depressive moods of users portrayed in twitter</article-title>
          <year>2012</year>
          <conf-name>Proceedings of the ACM SIGKDD Workshop On Healthcare Informatics (HI-KDD)</conf-name>
          <conf-date>2012</conf-date>
          <conf-loc>San Diego</conf-loc>
          <fpage>1</fpage>
          <lpage>8</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Choudhury</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Gamon</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Counts</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Horvitz</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Predicting Depression via Social Media</article-title>
          <year>2013</year>
          <month>7</month>
          <conf-name>International AAAI Conference on Weblogs and Social Media</conf-name>
          <conf-date>2013</conf-date>
          <conf-loc>Cambridge, Massachusetts, USA</conf-loc>
          <fpage>128</fpage>
          <lpage>137</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hasan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rundensteiner</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Agu</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>EMOTEX: Detecting Emotions in Twitter Messages</article-title>
          <year>2014</year>
          <month>04</month>
          <conf-name>ASE BigData/SocialCom/CyberSecurity Conference</conf-name>
          <conf-date>2014</conf-date>
          <conf-loc>Stanford, California, USA</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Resnik</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Armstrong</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Claudino</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>VA</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd-Graber</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Beyond LDA: Exploring Supervised Topic Modeling for Depression-Related Language in Twitter</article-title>
          <year>2015</year>
          <conf-name>The 2nd Workshop on Computational Linguistics and Clinical Psychology: From Linguistic Signal to Clinical Reality</conf-name>
          <conf-date>2015</conf-date>
          <conf-loc>Denver, Colorado, USA</conf-loc>
          <fpage>99</fpage>
          <lpage>107</lpage>
          <pub-id pub-id-type="doi">10.3115/v1/w15-1212</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tsugawa</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kikuchi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kishino</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Nakajima</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Itoh</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ohsaki</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Recognizing Depression from Twitter Activity</article-title>
          <source>ACM</source>
          <year>2015</year>
          <conf-name>The 33rd Annual ACM Conference on Human Factors in Computing Systems</conf-name>
          <conf-date>2015</conf-date>
          <conf-loc>Seoul, Republic of Korea</conf-loc>
          <fpage>3187</fpage>
          <lpage>3196</lpage>
          <pub-id pub-id-type="doi">10.1145/2702123.2702280</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guangyao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liqiang</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Fuli</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Cunjun</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Tianrui</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Tat-Seng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wenwu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Depression Detection via Harvesting Social Media: A Multimodal Dictionary Learning Solution</article-title>
          <year>2017</year>
          <conf-name>The Twenty-Sixth International Joint Conference on Artificial Intelligence</conf-name>
          <conf-date>2017</conf-date>
          <conf-loc>Melbourne, Australia</conf-loc>
          <fpage>3838</fpage>
          <lpage>3844</lpage>
          <pub-id pub-id-type="doi">10.24963/ijcai.2017/536</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Losada</surname>
              <given-names>DE</given-names>
            </name>
            <name name-style="western">
              <surname>Gamallo</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Evaluating and improving lexical resources for detecting signs of depression in text</article-title>
          <source>Lang Resources &#38; Evaluation</source>
          <year>2018</year>
          <month>8</month>
          <day>6</day>
          <volume>54</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>24</lpage>
          <pub-id pub-id-type="doi">10.1007/s10579-018-9423-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="web">
          <source>GitHub</source>
          <access-date>2020-04-15</access-date>
          <comment>Chinese-Depression-domain-Lexicon<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/omfoggynight/Chinese-Depression-domain-Lexicon">https://github.com/omfoggynight/Chinese-Depression-domain-Lexicon</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Beck</surname>
              <given-names>AT</given-names>
            </name>
          </person-group>
          <article-title>An inventory for measuring depression</article-title>
          <source>Arch Gen Psychiatry</source>
          <year>1961</year>
          <month>06</month>
          <volume>4</volume>
          <fpage>561</fpage>
          <lpage>71</lpage>
          <pub-id pub-id-type="doi">10.1001/archpsyc.1961.01710120031004</pub-id>
          <pub-id pub-id-type="medline">13688369</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Beck</surname>
              <given-names>AT</given-names>
            </name>
            <name name-style="western">
              <surname>Steer</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>GK</given-names>
            </name>
          </person-group>
          <source>Manual for the Beck Depression Inventory-II</source>
          <year>1996</year>
          <publisher-loc>San Antonio, Texas</publisher-loc>
          <publisher-name>Psychological Corporation</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Radloff</surname>
              <given-names>LS</given-names>
            </name>
          </person-group>
          <article-title>The CES-D Scale</article-title>
          <source>Applied Psychological Measurement</source>
          <year>2016</year>
          <month>07</month>
          <day>26</day>
          <volume>1</volume>
          <issue>3</issue>
          <fpage>385</fpage>
          <lpage>401</lpage>
          <pub-id pub-id-type="doi">10.1177/014662167700100306</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zung</surname>
              <given-names>WW</given-names>
            </name>
          </person-group>
          <article-title>A self-rating depression scale</article-title>
          <source>Arch Gen Psychiatry</source>
          <year>1965</year>
          <month>01</month>
          <volume>12</volume>
          <fpage>63</fpage>
          <lpage>70</lpage>
          <pub-id pub-id-type="doi">10.1001/archpsyc.1965.01720310065008</pub-id>
          <pub-id pub-id-type="medline">14221692</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hamilton</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Development of a rating scale for primary depressive illness</article-title>
          <source>Br J Soc Clin Psychol</source>
          <year>1967</year>
          <month>12</month>
          <volume>6</volume>
          <issue>4</issue>
          <fpage>278</fpage>
          <lpage>96</lpage>
          <pub-id pub-id-type="doi">10.1111/j.2044-8260.1967.tb00530.x</pub-id>
          <pub-id pub-id-type="medline">6080235</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>RJ</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic and Statistical Manual of Mental Disorders (DSM-IV)</article-title>
          <source>Encyclopedia of the Neurological Sciences</source>
          <year>2003</year>
          <volume>25</volume>
          <issue>2</issue>
          <fpage>4</fpage>
          <lpage>8</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Das</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Poria</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bandyopadhyay</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A Classifier Based Approach to Emotion Lexicon Construction</article-title>
          <source>Natural Language Processing and Information Systems</source>
          <year>2012</year>
          <volume>7337</volume>
          <pub-id pub-id-type="doi">10.1007/978-3-642-31178-9_41</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krestel</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Siersdorfer</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Generating Contextualized Sentiment Lexica based on Latent Topics and User Ratings</article-title>
          <source>Proceedings of the 24th ACM Conference on Hypertext and Social Media</source>
          <year>2013</year>
          <conf-name>The 24th ACM Conference on Hypertext and Social Media</conf-name>
          <conf-date>2013</conf-date>
          <conf-loc>Paris, France</conf-loc>
          <fpage>129</fpage>
          <lpage>138</lpage>
          <pub-id pub-id-type="doi">10.1145/2481492.2481506</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Improving Lexical Embeddings with Semantic Knowledge</article-title>
          <year>2014</year>
          <conf-name>The 52nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</conf-name>
          <conf-date>2014</conf-date>
          <conf-loc>Baltimore, Maryland, USA</conf-loc>
          <pub-id pub-id-type="doi">10.3115/v1/p14-2089</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tixier</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Vazirgiannis</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hallowell</surname>
              <given-names>MR</given-names>
            </name>
          </person-group>
          <article-title>Word Embeddings for the Construction Domain</article-title>
          <source>arXiv e-prints</source>
          <year>2016</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1610.09333"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhenyu</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Research on depression prediction of micro-blog users based on word embedding method</article-title>
          <source>Electronic Technology &#38; Software Engineering</source>
          <year>2017</year>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chao</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Xun</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Yaping</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Construction Method of Chinese Cross-Domain Sentiment Lexicon Based on Word Vector</article-title>
          <source>Journal of Data Acquisition and Processing</source>
          <year>2017</year>
          <fpage>579</fpage>
          <lpage>587</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xiaojin</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Ghahramani</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Learning from Labeled and Unlabeled Data with Label Propagation</article-title>
          <source>Tech Report</source>
          <year>2002</year>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rao</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ravichandran</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Semi-Supervised Polarity Lexicon Induction</article-title>
          <year>2009</year>
          <conf-name>12th Conference of the European Chapter of the Association for Computational Linguistics</conf-name>
          <conf-date>2009</conf-date>
          <conf-loc>Athens, Greece</conf-loc>
          <pub-id pub-id-type="doi">10.3115/1609067.1609142</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brody</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Elhadad</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>An Unsupervised Aspect-Sentiment Model for Online Reviews</article-title>
          <year>2010</year>
          <conf-name>Human Language Technologies: Conference of the North American Chapter of the Association of Computational Linguistics</conf-name>
          <conf-date>2010</conf-date>
          <conf-loc>Los Angeles, California, USA</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yen-Jen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Hung-Yu</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Automatic Domain-Specific Sentiment Lexicon Generation with Label Propagation</article-title>
          <source>Proceedings of International Conference on Information Integration and Web-based Applications &#38; Services</source>
          <year>2013</year>
          <conf-name>Conference on Information Integration and Web-based Applications &#38; Services</conf-name>
          <conf-date>2013</conf-date>
          <conf-loc>Vienna, Austria</conf-loc>
          <fpage>02</fpage>
          <lpage>2013</lpage>
          <pub-id pub-id-type="doi">10.1145/2539150.2539190</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hamilton</surname>
              <given-names>WL</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Leskovec</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jurafsky</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Inducing Domain-Specific Sentiment Lexicons from Unlabeled Corpora</article-title>
          <source>Proc Conf Empir Methods Nat Lang Process</source>
          <year>2016</year>
          <month>11</month>
          <volume>2016</volume>
          <fpage>595</fpage>
          <lpage>605</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/D16-1057</pub-id>
          <pub-id pub-id-type="medline">28660257</pub-id>
          <pub-id pub-id-type="pmcid">PMC5483533</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Giulianelli</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Semi-supervised emotion lexicon expansion with label propagation and specialized word embeddings</article-title>
          <source>arXiv e-prints</source>
          <year>2017</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1708.03910"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Junxia</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Yinghao</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Sentiment Lexicon Construction Method Based on Label Propagation</article-title>
          <source>Computer Engineering</source>
          <year>2018</year>
          <volume>44</volume>
          <issue>5</issue>
          <fpage>168</fpage>
          <lpage>173</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="web">
          <source>Sina Weibo</source>
          <access-date>2020-06-05</access-date>
          <comment>2018 Weibo User Development Report<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://data.weibo.com/report/reportDetail?id=433">https://data.weibo.com/report/reportDetail?id=433</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Coppersmith</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Craig</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Quantifying Mental Health Signals in Twitter</article-title>
          <year>2014</year>
          <conf-name>Workshop on Computational Linguistics and Clinical Psychology: From Linguistic Signal to Clinical Reality</conf-name>
          <conf-date>2014</conf-date>
          <conf-loc>Baltimore, Maryland, USA</conf-loc>
          <fpage>51</fpage>
          <lpage>60</lpage>
          <pub-id pub-id-type="doi">10.3115/v1/W14-3207</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tiancheng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Guangyao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Fuli</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Xiangnan</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Huanbo</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Jie</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Thanassis</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Tat-Seng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wendy</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Cross-Domain Depression Detection via Harvesting Social Media</article-title>
          <year>2018</year>
          <conf-name>Twenty-Seventh International Joint Conference on Artificial Intelligence</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>Stockholm</conf-loc>
          <fpage>1611</fpage>
          <lpage>1617</lpage>
          <pub-id pub-id-type="doi">10.24963/ijcai.2018/223</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ke</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Rui</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>A survey on automatical construction methods of sentiment lexicons</article-title>
          <source>Acta Automatica Sinica</source>
          <year>2016</year>
          <volume>42</volume>
          <issue>4</issue>
          <fpage>495</fpage>
          <lpage>511</lpage>
          <pub-id pub-id-type="doi">10.16383/j.aas.2016.c150585</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Blair-Goldensohn</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Neylon</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Hannan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Reis</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>McDonald</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Reynar</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Building a Sentiment Summarizer for Local Service Reviews</article-title>
          <year>2008</year>
          <conf-name>WWW2008 Workshop: NLP in the Information Explosion Era (NLPIX 2008)</conf-name>
          <conf-date>2008</conf-date>
          <conf-loc>Beijing, China</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Baccianella</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Esuli</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sebastiani</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>SentiWordNet 3.0: An Enhanced Lexical Resource for Sentiment Analysis and Opinion Mining</article-title>
          <year>2010</year>
          <conf-name>International Conference on Language Resources and Evaluation</conf-name>
          <conf-date>2010</conf-date>
          <conf-loc>Valletta, Malta</conf-loc>
          <fpage>17</fpage>
          <lpage>23</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kanayama</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Nasukawa</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Fully Automatic Lexicon Expansion for Domain-Oriented Sentiment Analysis</article-title>
          <year>2006</year>
          <conf-name>Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>2006</conf-date>
          <conf-loc>Sydney, Australia</conf-loc>
          <pub-id pub-id-type="doi">10.3115/1610075.1610125</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krestel</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Siersdorfer</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Generating contextualized sentiment lexica based on latent topics and user ratings</article-title>
          <year>2013</year>
          <conf-name>The 24th ACM Conference on Hypertext and Social Media</conf-name>
          <conf-date>2013</conf-date>
          <conf-loc>Paris, France</conf-loc>
          <fpage>129</fpage>
          <lpage>138</lpage>
          <pub-id pub-id-type="doi">10.1145/2481492.2481506</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Salton</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>CT</given-names>
            </name>
          </person-group>
          <article-title>On the construction of effective vocabularies for information retrieval</article-title>
          <source>SIGPLAN Not.</source>
          <year>1975</year>
          <month>01</month>
          <day>01</day>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>48</fpage>
          <lpage>60</lpage>
          <pub-id pub-id-type="doi">10.1145/951787.951766</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Salton</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Fox</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Extended Boolean information retrieval</article-title>
          <source>Commun. ACM</source>
          <year>1983</year>
          <volume>26</volume>
          <issue>11</issue>
          <fpage>1022</fpage>
          <lpage>1036</lpage>
          <pub-id pub-id-type="doi">10.1145/182.358466</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Distributed Representations of Words and Phrases and their Compositionality</article-title>
          <source>Advances in Neural Information Processing Systems</source>
          <year>2013</year>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cordasco</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gargano</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Community Detection via Semi-Synchronous Label Propagation Algorithms</article-title>
          <source>Int. J. of Social Network Mining</source>
          <year>2012</year>
          <pub-id pub-id-type="doi">10.1109/basna.2010.5730298</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Boldi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Rosa</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Santini</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Vigna</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Layered label propagation: a multiresolution coordinate-free ordering for compressing social networks</article-title>
          <year>2011</year>
          <conf-name>The 20th International Conference on World Wide Web</conf-name>
          <conf-date>April 01, 2011</conf-date>
          <conf-loc>Hyderabad, India</conf-loc>
          <pub-id pub-id-type="doi">10.1145/1963405.1963488</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huijie</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liqiang</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Guangyao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tat-Seng</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>What does social media say about your stress?</article-title>
          <source>Proceedings of the Twenty-Fifth International Joint Conference on Artificial Intelligence</source>
          <year>2016</year>
          <conf-name>The Twenty-Fifth International Joint Conference on Artificial Intelligence</conf-name>
          <conf-date>2016</conf-date>
          <conf-loc>New York, USA</conf-loc>
          <fpage>3775</fpage>
          <lpage>3781</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yusi</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Yuntao</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Domain specific Chinese word segmentation</article-title>
          <source>Computer Engineering and Applications</source>
          <year>2018</year>
          <volume>54</volume>
          <issue>17</issue>
          <fpage>30</fpage>
          <lpage>34</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="web">
          <source>CNKI</source>
          <access-date>2020-04-15</access-date>
          <comment>CNKI tool library<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://mall.cnki.net/reference/index.aspx">http://mall.cnki.net/reference/index.aspx</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="web">
          <source>BOSON</source>
          <access-date>2018-09-01</access-date>
          <comment>BosonNLP Dictionary<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bosonnlp.com/dev/resource">https://bosonnlp.com/dev/resource</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="web">
          <source>GitHub</source>
          <access-date>2018-09-01</access-date>
          <comment>jieba<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/fxsjy/jieba">https://github.com/fxsjy/jieba</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhe</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Renfen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wensi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Xiaoyong</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Analogical Reasoning on Chinese Morphological and Semantic Relations</article-title>
          <year>2018</year>
          <conf-name>The 56th Annual Meeting of the Association for Computational Linguistics</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>Melbourne, Australia</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="web">
          <source>GitHub</source>
          <access-date>2018-09-01</access-date>
          <comment>Gensim – Topic Modelling in Python<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/RaRe-Technologies/gensim">https://github.com/RaRe-Technologies/gensim</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pennebaker</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Francis</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Booth</surname>
              <given-names>RJ</given-names>
            </name>
          </person-group>
          <source>Linguistic inquiry and word count (LIWC)</source>
          <year>1999</year>
          <publisher-loc>Mahwah, NJ</publisher-loc>
          <publisher-name>Erlbaum Publishers</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dinkel</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Text-based Depression Detection: What Triggers An Alert</article-title>
          <source>arXiv e-prints</source>
          <year>2019</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1904.05154"/>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
