<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v10i7e33678</article-id>
      <article-id pub-id-type="pmid">35862172</article-id>
      <article-id pub-id-type="doi">10.2196/33678</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Classification of Twitter Vaping Discourse Using BERTweet: Comparative Deep Learning Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Chen</surname>
            <given-names>Shi</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Doan</surname>
            <given-names>Son</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhou</surname>
            <given-names>Xinyu</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Kim</surname>
            <given-names>Seongsoon</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Baker</surname>
            <given-names>William</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1013-3491</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Colditz</surname>
            <given-names>Jason B</given-names>
          </name>
          <degrees>MEd, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2811-841X</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Dobbs</surname>
            <given-names>Page D</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <address>
            <institution>Health, Human Performance and Recreation Department</institution>
            <institution>University of Arkansas</institution>
            <addr-line>308A HPER Building</addr-line>
            <addr-line>Fayetteville, AR, 72701</addr-line>
            <country>United States</country>
            <phone>1 476 575 8680</phone>
            <email>pdobbs@uark.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1913-6488</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Mai</surname>
            <given-names>Huy</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4945-4316</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Visweswaran</surname>
            <given-names>Shyam</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2079-8684</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Zhan</surname>
            <given-names>Justin</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8991-5669</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Primack</surname>
            <given-names>Brian A</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5962-0939</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Computer Science and Computer Engineering</institution>
        <institution>University of Arkansas</institution>
        <addr-line>Fayetteville, AR</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Division of General Internal Medicine</institution>
        <institution>University of Pittsburgh School of Medicine</institution>
        <addr-line>Pittsburgh, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Health, Human Performance and Recreation Department</institution>
        <institution>University of Arkansas</institution>
        <addr-line>Fayetteville, AR</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Biomedical Informatics</institution>
        <institution>University of Pittsburgh</institution>
        <addr-line>Pittsburgh, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>College of Public Health and Human Science</institution>
        <institution>Oregon State University</institution>
        <addr-line>Corvallis, OR</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Page D Dobbs <email>pdobbs@uark.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>7</month>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>21</day>
        <month>7</month>
        <year>2022</year>
      </pub-date>
      <volume>10</volume>
      <issue>7</issue>
      <elocation-id>e33678</elocation-id>
      <history>
        <date date-type="received">
          <day>18</day>
          <month>9</month>
          <year>2021</year>
        </date>
        <date date-type="rev-request">
          <day>30</day>
          <month>12</month>
          <year>2021</year>
        </date>
        <date date-type="rev-recd">
          <day>21</day>
          <month>3</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>8</day>
          <month>5</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©William Baker, Jason B Colditz, Page D Dobbs, Huy Mai, Shyam Visweswaran, Justin Zhan, Brian A Primack. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 21.07.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2022/7/e33678" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Twitter provides a valuable platform for the surveillance and monitoring of public health topics; however, manually categorizing large quantities of Twitter data is labor intensive and presents barriers to identifying major trends and sentiments. Additionally, while machine and deep learning approaches have been proposed with high accuracy, they require large, annotated data sets. Public pretrained deep learning classification models, such as BERTweet, produce higher-quality models while using smaller annotated training sets.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to derive and evaluate a pretrained deep learning model based on BERTweet that can identify tweets relevant to vaping, tweets (related to vaping) of commercial nature, and tweets with provape sentiment. Additionally, the performance of the BERTweet classifier will be compared against a long short-term memory (LSTM) model to show the improvements a pretrained model has over traditional deep learning approaches.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Twitter data were collected from August to October 2019 using vaping-related search terms. From this set, a random subsample of 2401 English tweets was manually annotated for relevance (vaping related or not), commercial nature (commercial or not), and sentiment (positive, negative, or neutral). Using the annotated data, 3 separate classifiers were built using BERTweet with the default parameters defined by the Simple Transformer application programming interface (API). Each model was trained for 20 iterations and evaluated with a random split of the annotated tweets, reserving 10% (n=165) of tweets for evaluations.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The relevance, commercial, and sentiment classifiers achieved an area under the receiver operating characteristic curve (AUROC) of 94.5%, 99.3%, and 81.7%, respectively. Additionally, the weighted F1 scores of each were 97.6%, 99.0%, and 86.1%, respectively. We found that BERTweet outperformed the LSTM model in the classification of all categories.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Large, open-source deep learning classifiers, such as BERTweet, can provide researchers the ability to reliably determine if tweets are relevant to vaping; include commercial content; and include positive, negative, or neutral content about vaping with a higher accuracy than traditional natural language processing deep learning models. Such enhancement to the utilization of Twitter data can allow for faster exploration and dissemination of time-sensitive data than traditional methodologies (eg, surveys, polling research).</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>vaping</kwd>
        <kwd>social media</kwd>
        <kwd>deep learning</kwd>
        <kwd>transformer models</kwd>
        <kwd>infoveillance</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Since its launch in 2006, Twitter has exploded in popularity to become one of the top social media platforms. As of 2021, the site hosts 192 million daily active users worldwide [<xref ref-type="bibr" rid="ref1">1</xref>]. The 280-character constraint on a Twitter text post, called a tweet, lends itself well to spontaneous and organic interactions. The candid nature of the tweets provides invaluable data for the public health realm. Patients spend relatively little time with health care professionals, with some only seeing their primary care physician once every other year, and therefore it can be difficult for health care workers to accurately address needs or feelings that patients often find uncomfortable disclosing to others [<xref ref-type="bibr" rid="ref2">2</xref>].</p>
        <p>While Twitter provides a valuable platform for the surveillance and monitoring of public health topics, manually categorizing large quantities of Twitter data presents challenges in identifying major trends and sentiments in a timely manner. Machine and deep learning methods have previously been proposed to provide a framework for systematic and automated processing and analysis of Twitter data to develop surveillance systems with applications to public health [<xref ref-type="bibr" rid="ref3">3</xref>]. While these models achieve high accuracy, they require large sets of annotated data to be trained. By contrast, public pretrained deep learning classification models, such as BERTweet, produce higher-quality models while using smaller annotated training sets [<xref ref-type="bibr" rid="ref4">4</xref>]. In this study, we derive and evaluate a pretrained deep learning model based on BERTweet that can identify tweets relevant to vaping, tweets of commercial nature, and tweets with provape sentiment. We compare the results of the BERTweet-based classifier with a long short-term memory (LSTM) model to show the improvements a pretrained model has over traditional deep learning approaches.</p>
      </sec>
      <sec>
        <title>Traditional Deep Learning</title>
        <p>Deep learning is a class of machine learning algorithms that uses multiple layers to progressively extract higher-level features from raw input [<xref ref-type="bibr" rid="ref4">4</xref>]. Several types of deep learning architectures exist, such as deep neural networks, recurrent neural networks (RNNs), and convolutional neural networks (CNNs). Applications of deep learning include computer vision, speech recognition, natural language processing, and drug design.</p>
        <p>In their work, Visweswaran et al [<xref ref-type="bibr" rid="ref3">3</xref>] found that LSTM models performed particularly well on tweet classification for relevance, sentiment, and commercial nature [<xref ref-type="bibr" rid="ref3">3</xref>]. An LSTM network is a special kind of RNN capable of learning long-term dependencies [<xref ref-type="bibr" rid="ref5">5</xref>]. Unlike standard feedforward networks, such as CNNs, LSTMs have a feedback connection. This feedback connection allows the network to not only process a single data point (ie, a word), but also entire sequences of data (ie, sentence or phrase), which makes them extremely powerful in classifying sentiment of a message.</p>
      </sec>
      <sec>
        <title>Pretrained Transformer Models</title>
        <p>Over the last few years, transformer models have been very effective for a large variety of natural language processing tasks. First proposed by Colditz et al [<xref ref-type="bibr" rid="ref6">6</xref>], transformers use a self-attention mechanism to capture what aspects of a sequence are important in a series of tokens. In simple terms, self-attention mechanisms aim to create real natural language understanding in machines.</p>
        <p>In 2018, Google AI Language released the Bidirectional Encoder Representations from Transformers (BERT) model, which improves upon the original transformer model by learning token representations in both directions [<xref ref-type="bibr" rid="ref7">7</xref>]. In normal transformers, a sequence is analyzed either left to right or right to left, but not in both directions. To achieve this, BERT uses a revamped pretraining procedure that includes masked language model and next sentence prediction objectives [<xref ref-type="bibr" rid="ref2">2</xref>]. Several BERT models pretrained on a variety of texts, languages, and topics are available freely to the public. This creates a ready-made approach for researchers trying to create models for a number of language tasks, including text classification. Researchers can use BERT in its default settings, or they can apply fine-tuning on a data set closely applicable to the task at hand. For instance, in this study, the created model is fine-tuned on a set of hand-annotated tweets before testing the classification accuracy of the system.</p>
        <p>After BERT was introduced, the “Robustly optimized BERT pre-training approach” (RoBERTa) was published [<xref ref-type="bibr" rid="ref8">8</xref>]. RoBERTa was created out of the authors’ experimentation with the default hyperparameters of BERT. They found that BERT was significantly undertrained, and that with some minor changes, the modified BERT model was able to outperform newer and even larger transformer models. Pretraining optimizations in RoBERTa include dynamic masking, large mini-batches, larger byte-pair encodings, and using full sentences across documents. We refer to Liu et al [<xref ref-type="bibr" rid="ref8">8</xref>] for a more detailed discussion of the optimizations performed in RoBERTa. Like BERT, many pretrained variations of RoBERTa are available online.</p>
        <p>BERTweet is a public BERT-based model trained using the RoBERTa pretraining procedure [<xref ref-type="bibr" rid="ref9">9</xref>]. Released in 2020, it was the first large-scale pretrained language model for English tweets to be released to other researchers for further improvements and novel applications. BERTweet was trained on 850 million English tweets collected from 2012 to 2019, which prepares it well for novel downstream classification tasks on a set of tweets. This pipeline of pretraining on a large text corpus and then fine-tuning the model for classification tasks is called transfer learning [<xref ref-type="bibr" rid="ref2">2</xref>]. It has been shown that pretraining is integral to model performance on downstream tasks, and it follows that pretraining a model on material that is similar to the texts in the downstream task will yield improved performance. Therefore, having access to a model trained on a large corpus of tweets is invaluable for the creation of a Twitter-based public health surveillance system. We refer to Nguyen et al [<xref ref-type="bibr" rid="ref9">9</xref>] for a more detailed explanation of how the BERTweet model functions.</p>
      </sec>
      <sec>
        <title>Objective</title>
        <p>It is our goal to produce an accurate BERTweet-based deep learning classifier that can improve upon existing Twitter surveillance systems that are focused on vaping-related tweets. Additionally, we aim to produce a classifier that is reliable and accurate in assessing a tweet for relevance (relevant or not), sentiment (positive, negative, or neutral), and commercial nature (commercial or not). Leveraging Twitter as a complement to traditional surveillance will allow for real-time identification of changes that can be used by public health practitioners. For example, when positive sentiment toward vaping rises, practitioners may be able to determine the exact reasons why and respond accordingly. Similarly, when there is a notable spike in misinformation about vaping and its effects on health, health experts will be able to act immediately to correct this information [<xref ref-type="bibr" rid="ref3">3</xref>].</p>
      </sec>
      <sec>
        <title>Related Work</title>
        <p>Several works have proposed classifiers to classify Twitter data in terms of sentiment. Further, the last few years have seen a surge in publications on creating classifiers to analyze public health trends as depicted on Twitter. Gohil et al [<xref ref-type="bibr" rid="ref10">10</xref>] performed a review of current sentiment analysis tools available for researchers. They found that while multiple methods existed for analyzing the sentiment of tweets in the health care setting, there is still the need for an accurate and verified tool for sentiment analysis trained using health care setting–specific tweets. Edara et al [<xref ref-type="bibr" rid="ref11">11</xref>] developed an LSTM to classify cancer-related tweets based on the tone of the tweet and compared the results against several traditional machine learning approaches. They found that the LSTM model outperformed all of the other approaches. Ji et al [<xref ref-type="bibr" rid="ref12">12</xref>] utilized the Twitter platform to monitor the spread of public concern about epidemics by separating personal tweets from news tweets and then further categorizing the personal tweets into those that are negative and nonnegative using a naïve Bayes classifier.</p>
        <p>For a general approach to performing a sentiment analysis on Twitter data, Agarwal et al [<xref ref-type="bibr" rid="ref13">13</xref>] introduced unigram, feature-based, and tree-based models to classify tweets as either a binary task (positive or negative) or a 3-way task (positive, negative, and neutral). Harjule et al [<xref ref-type="bibr" rid="ref14">14</xref>] proposed another general approach to classifying the sentiment of tweets. The authors analyzed several lexicon and machine learning–based tweet sentiment classifiers on a large group of data sets and found that the machine learning models were more accurate at classifying sentiment. Kharde and Sonawane [<xref ref-type="bibr" rid="ref15">15</xref>] performed a similar comparative analysis and verified the claim from Harjule et al [<xref ref-type="bibr" rid="ref14">14</xref>] that machine learning classifiers yield higher accuracy, with the caveat that lexicon-based methods can be more effective in some cases.</p>
        <p>Beyond general sentiment and public health monitoring, several studies have looked at using Twitter to monitor trends toward vaping and e-cigarettes [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. Han and Kavuluru [<xref ref-type="bibr" rid="ref18">18</xref>] implemented several machine learning models, such as support vector machines, logistic regression, and CNNs, to identify marketing and nonmarketing e-cigarette tweets. Further, Myslín et al [<xref ref-type="bibr" rid="ref19">19</xref>] and Cole-Lewis et al [<xref ref-type="bibr" rid="ref20">20</xref>] annotated tobacco-related tweets and derived several machine learning classifiers to predict sentiment. Huang et al [<xref ref-type="bibr" rid="ref21">21</xref>] analyzed tweets using machine learning classifiers to find trends in the commercial nature of tweets relating to vaping. They found that tweets related to e-cigarettes were about 90% commercial and about 10% mentioned smoking cessation. Resende and Culotta [<xref ref-type="bibr" rid="ref22">22</xref>] derived a sentiment classifier for e-cigarette–related tweets that identified positive and negative tweets with 96% and 70% precision, respectively. Visweswaran et al [<xref ref-type="bibr" rid="ref3">3</xref>] performed an in-depth comparison of traditional machine learning classifiers (regression, random forest, linear support vector machine, and multinomial naïve Bayes) with deep learning classifiers (CNN, LSTM, LSTM-CNN, and bidirectional LSTM), and found that among all the tested networks, LSTM achieved the highest classification accuracy.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Collection</title>
        <p>Tweets were collected continuously from August to October 2019 using the Real-Time Infoveillance of Twitter Health Messages (RITHM) framework [<xref ref-type="bibr" rid="ref6">6</xref>]. The RITHM framework is an open-source software for collecting and formatting Twitter data. It additionally provides procedures for maximizing the efficiency and effectiveness of subsequent human data coding. The keywords that we used for data collection include <italic>vape, vapes, vaper, vapers, vaping, juul, juuls,</italic> and <italic>juuling</italic>. The vaping-related keywords are based on previous Twitter research [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref10">10</xref>] and, in particular, we included keywords to identify the highly popular e-cigarette brand, JUUL, which had the highest market share at the time from which data were collected [<xref ref-type="bibr" rid="ref23">23</xref>]. We identified and collected all tweets that matched 1 or more keywords from the list above.</p>
      </sec>
      <sec>
        <title>Annotation</title>
        <p>After data collection, a random subsample of 2401 English tweets was annotated for relevance (vaping related or not), commercial nature (commercial or not), and sentiment (positive, negative, or neutral). This annotation was done in accordance with the 3-level hierarchical annotation schema, as depicted in <xref ref-type="table" rid="table1">Table 1</xref>. A tweet was first annotated for relevance. Then, only if the tweet was relevant, was it annotated for commercial nature and sentiment.</p>
        <p>A team of 2 trained annotators independently annotated batches of 400 tweets at a time. Adjudication of annotation disagreements was carried out in the presence of the supervising investigator. All annotated codes have a Cohen κ value over 0.70, indicating strong internal agreement among annotators. The full set of 2401 adjudicated annotations and tweet content were used in the training of the classifier models. A detailed description of the annotations can be found in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Descriptions of labels used for annotating vaping-related tweets.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="160"/>
            <col width="380"/>
            <col width="460"/>
            <thead>
              <tr valign="top">
                <td>Labels</td>
                <td>Descriptions</td>
                <td>Example quotes</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Relevant</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Is the tweet in English and related to the vaping topic at hand (eg, vape use or users, vaping devices, or products)?</p>
                    </list-item>
                  </list>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>Not relevant</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Typically, non-English tweets or tweets that referenced vaping cannabis products specifically.</p>
                    </list-item>
                  </list>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>Commercial</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Is the tweet an advertisement/marketing for vaping products?</p>
                    </list-item>
                  </list>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>
                        <italic>Today only! Buy one JUUL get the second half price with our online coupon code #JUUL4LIFE</italic>
                      </p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Noncommercial</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Includes tweets that demonstrate favorability toward a product but do not directly advocate for purchasing it.</p>
                    </list-item>
                  </list>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>Positive</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>The tweet is associated with positive emotions or contexts regarding vaping.</p>
                    </list-item>
                  </list>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>The tweeter is currently, or has recently used, or is going to vape:</p>
                      <list>
                        <list-item>
                          <p>
                            <italic>Currently juuling in the bathroom at school!</italic>
                          </p>
                        </list-item>
                      </list>
                    </list-item>
                    <list-item>
                      <p>The tweeter shows positivity or neutral acceptance from others’ usage or others’ positive comments about vaping:</p>
                      <list>
                        <list-item>
                          <p>
                            <italic>Just got Hannah to try vaping for the first time! She loved it.</italic>
                          </p>
                        </list-item>
                      </list>
                    </list-item>
                    <list-item>
                      <p>The tweeter mentions a vape pen in association with other positive aspects of society or popular culture.</p>
                      <list>
                        <list-item>
                          <p>
                            <italic>We need a Disney princess that rips her JUUL in the middle of a serious conversation.</italic>
                          </p>
                        </list-item>
                      </list>
                    </list-item>
                    <list-item>
                      <p>The tweeter asks a question using first-person pronouns:</p>
                      <list>
                        <list-item>
                          <p>
                            <italic>Where can I buy a JUUL?</italic>
                          </p>
                        </list-item>
                      </list>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Negative</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>The tweet is associated with negative emotions or contexts regarding vaping.</p>
                    </list-item>
                  </list>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>The tweeter believes smoking a vape is disgusting, uncool, or unattractive:</p>
                      <list>
                        <list-item>
                          <p>
                            <italic>Cannot believe everyone is smoking JUULs these days. I think it’s disgusting.</italic>
                          </p>
                        </list-item>
                      </list>
                    </list-item>
                    <list-item>
                      <p>The tweeter criticizes/ridicules others for using a vape:</p>
                      <list>
                        <list-item>
                          <p>
                            <italic>ur mcm says ‘cigarettes are gross’ yet is addicted to nicotine through cool cucumber flavored JUUL pods.</italic>
                          </p>
                        </list-item>
                      </list>
                    </list-item>
                    <list-item>
                      <p>The tweeter prefers to use a different substance, such as cigarettes or marijuana:</p>
                      <list list-type="bullet">
                        <list-item>
                          <p>
                            <italic>Tried a JUUL today for the first time but I still prefer cigarettes over it.</italic>
                          </p>
                        </list-item>
                      </list>
                    </list-item>
                  </list>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>Neutral</td>
                <td>
                  <break/>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>The tweet is factual but not opinionated or is a question about unbiased facts/information about vaping:</p>
                      <list list-type="bullet">
                        <list-item>
                          <p>
                            <italic>They are selling JUUL pens at my local tobacco shop for anyone interested.</italic>
                          </p>
                        </list-item>
                        <list-item>
                          <p>
                            <italic>What is a JUUL?</italic>
                          </p>
                        </list-item>
                        <list-item>
                          <p>
                            <italic>Is a JUUL better than tobacco?</italic>
                          </p>
                        </list-item>
                      </list>
                    </list-item>
                  </list>
                </td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Description of annotated training and test data sets (N=2401).<sup>a</sup></p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="160"/>
            <col width="280"/>
            <col width="290"/>
            <col width="270"/>
            <thead>
              <tr valign="top">
                <td>Targets</td>
                <td>Number of tweets with a positive target, n (%)</td>
                <td>Number of tweets with a negative target, n (%)</td>
                <td>Number of tweets with a neutral target, n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Relevance</td>
                <td>Relevant:<break/><list list-type="bullet"><list-item><p>Total: 1802 (75.05)</p></list-item><list-item><p>Training: 1637 (90.84)</p></list-item><list-item><p>Test: 165 (9.16)</p></list-item></list></td>
                <td>Nonrelevant:<break/><list list-type="bullet"><list-item><p>Total: 599 (24.95)</p></list-item><list-item><p>Training: 524 (87.48)</p></list-item><list-item><p>Test: 75 (12.52)</p></list-item></list></td>
                <td>N/A<sup>b</sup></td>
              </tr>
              <tr valign="top">
                <td>Commercial</td>
                <td>Commercial:<break/><list list-type="bullet"><list-item><p>Total: 117 (4.87)</p></list-item><list-item><p>Training: 106 (90.60)</p></list-item><list-item><p>Test: 11 (9.40)</p></list-item></list></td>
                <td>Noncommercial:<break/><list list-type="bullet"><list-item><p>Total: 1685 (70.18)</p></list-item><list-item><p>Training: 1516 (89.97)</p></list-item><list-item><p>Test: 169 (10.03)</p></list-item></list></td>
                <td>N/A</td>
              </tr>
              <tr valign="top">
                <td>Sentiment</td>
                <td>Positive:<break/><list list-type="bullet"><list-item><p>Total: 172 (7.16)</p></list-item><list-item><p>Training: 158 (91.86)</p></list-item><list-item><p>Test: 14 (8.14)</p></list-item></list></td>
                <td>Negative:<break/><list list-type="bullet"><list-item><p>Total: 130 (5.41)</p></list-item><list-item><p>Training: 119 (91.54)</p></list-item><list-item><p>Test: 11 (8.46)</p></list-item></list></td>
                <td>Neutral:<break/><list list-type="bullet"><list-item><p>Total: 1372 (57.14)</p></list-item><list-item><p>Training: 1229 (89.58)</p></list-item><list-item><p>Test: 143 (10.42)</p></list-item></list></td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Percentages may not add up to 100% as classification was made for sentiment only if the tweet was relevant.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>Sentiment-only code with neutral target.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>LSTM Model</title>
        <p>We will briefly recount the process explained in Visweswaran et al [<xref ref-type="bibr" rid="ref3">3</xref>] to train and evaluate an LSTM model to classify a tweet related to vaping as relevant; commercial; and if it was positive, negative, or neutral in sentiment. Our LSTM model was developed using the built-in functionality of the TensorFlow machine learning library. We utilized rectified linear unit (ReLU) as the activation function of the hidden layers and the sigmoid activation function for the output layer. Additionally, we utilized binary cross entropy as the loss function with Adam as the optimizer. In accordance with Visweswaran et al’s study [<xref ref-type="bibr" rid="ref3">3</xref>], we used nondomain-specific GloVe word vectors.</p>
        <p>After first testing a 70/30 split to create the relevance classifier and testing random splits to prevent overfitting, we found optimal results with a 90/10 split of the entire annotated data set, as all tweets were coded as either relevant or nonrelevant. We used the 90% split (n=1637) to train the LSTM relevance classifier, and then tested on the remaining 10% (n=165). We trained the model for 5 epochs using a batch size of 64. Both the commercial and sentiment classifiers followed the same training and testing procedures as the relevance classifier. The one difference was that only tweets labeled as relevant were used in the commercial and sentiment data sets. All nonrelevant tweets were filtered out and discarded.</p>
      </sec>
      <sec>
        <title>BERTweet</title>
        <p>To create a classifier for relevance, 90% of the tweets labeled as either relevant (n=1637) or nonrelevant (n=524) were used to fine-tune the BERTweet model, and the remaining 10% were used to test the final model (relevant n=165; nonrelevant n=75). This splitting, training, and testing process was repeated multiple times with random splits, and the accuracy results are the averages of each individual run. BERTweet was trained for 20 epochs with a batch size of 32 and a learning rate of 5 × 10<sup>–5</sup>. All other hyperparameters were left to the default values according to Simple Transformers API, which was used to accelerate the fine-tuning process for BERTweet and decrease the amount of proprietary code needed to be written. Tokenization of input tweet text was handled by Simple Transformers API, which automatically uses the BERTweet tokenizer defined by the creators of the model.</p>
        <p>To create the commercial and sentiment classifiers, annotated tweets were first filtered by relevance; nonrelevant tweets were discarded for these classifiers, and tweets marked relevant were then split into training and testing sets, and models were fine-tuned using the same process as the relevance classifier.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Overview</title>
        <p>We compared the performance of the LSTM and BERTweet classifiers in terms of F1 and AUROC scores. Additionally, each score is the average of 3 different testing iterations of the respective models. F1 is a function of precision and recall:</p>
        <p>
          <disp-formula>F1 = 2×(Precision × Recall)/(Precision + Recall) (1)</disp-formula>
        </p>
        <p>
          <disp-formula>Precision = True positive/(True positive + False positive) (2)</disp-formula>
        </p>
        <p>
          <disp-formula>Recall = True positive/(True positive + False negative) (3)</disp-formula>
        </p>
        <p>For F1, values closer to 1 on a scale of 0 to 1 indicate good balance between precision and recall.</p>
        <p>AUROC is the measure of the discrimination of the models, that is, how well a classifier differentiates between positive, negative, and neutral tweets. The larger the AUROC score is, the better the model performs.</p>
      </sec>
      <sec>
        <title>Relevance</title>
        <p>With regard to classifying a tweet as relevant or nonrelevant, the BERTweet classifier obtained an F1 score of 0.976 and an AUROC score of 0.945. The LSTM classifier achieved an F1 score of 0.924 and an AUROC score of 0.924. All runs of the BERTweet classifier achieved higher F1 and AUROC scores than the LSTM model.</p>
      </sec>
      <sec>
        <title>Commercial</title>
        <p>In classifying commercial tweets (commercial or noncommercial) the BERTweet classifier performed well with an F1 score of 0.990 and an AUROC of 0.993. Of all classes, the BERTweet performed best in commercial classification. The LSTM model produced a lower F1 score of 0.727 and a lower AUROC score of 0.903 in comparison to the BERTweet model (<xref ref-type="table" rid="table3">Table 3</xref>).</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Comparison of BERTweet and LSTM<sup>a</sup> F1 and AUROC<sup>b</sup> scores.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="270"/>
            <col width="0"/>
            <col width="250"/>
            <col width="0"/>
            <col width="270"/>
            <col width="180"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Classifier/metric</td>
                <td colspan="2">Relevance</td>
                <td>Commercial</td>
                <td>Sentiment</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="3">
                  <bold>BERTweet</bold>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td rowspan="2">
                  <break/>
                </td>
                <td>F1</td>
                <td colspan="2">0.976</td>
                <td colspan="2">0.990</td>
                <td>0.861</td>
              </tr>
              <tr valign="top">
                <td>AUROC</td>
                <td colspan="2">0.945</td>
                <td colspan="2">0.993</td>
                <td>0.817</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>LSTM</bold>
                </td>
                <td colspan="2">
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td rowspan="2">
                  <break/>
                </td>
                <td>F1</td>
                <td colspan="2">0.924</td>
                <td colspan="2">0.727</td>
                <td>0.250</td>
              </tr>
              <tr valign="top">
                <td>AUROC</td>
                <td colspan="2">0.924</td>
                <td colspan="2">0.903</td>
                <td>0.776</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>LSTM: long short-term memory.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>AUROC: area under the receiver operating characteristic curve.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Sentiment</title>
        <p>Both the BERTweet and LSTM models performed the worst in the classification of sentiment (positive, negative, or neutral). BERTweet obtained an F1 of 0.861 with an AUROC of 0.817. The LSTM model had an F1 of 0.250 with an AUROC of 0.776.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>This is the first study to use BERTweet to classify vaping-related tweets. Based on the analyses, we found that pretrained deep learning classifiers such as BERTweet perform exceptionally well at classifying a tweet as being relevant to vaping, being a commercial-natured tweet about vaping, as well as the sentiment of a tweet toward vaping. Compared with the LSTM classifier, the BERTweet classifier had AUROC values of 0.945, 0.993, and 0.817 for relevance, commercial nature, and sentiment, respectively. In general, these results show that pretrained classifiers can be utilized to monitor social media platforms such as Twitter for public health trends. Such enhancement to the utilization of Twitter data can allow for faster exploration and dissemination of time-sensitive data than traditional methodologies such as surveys and polling research.</p>
        <p>Practically, our work also serves to provide public health practitioners with vaping-related information on Twitter. For example, if there is an increase in positive sentiments of tweets, public health practitioners may find that a particular area is ready for policy change. Using the classification results, practitioners can also understand how many tweets are related to marketing of vaping and the relationship between sentiment of people and number of commercial tweets.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This study was performed with several limitations. First, a relatively small set of 2401 tweets was annotated by hand. Compared with another study [<xref ref-type="bibr" rid="ref3">3</xref>], this was just over half the size of the data set they annotated. While the set was small, it was enough to produce accurate results when using BERTweet, which is another testament to the power that pretrained transformer models have. However, this limitation does make it difficult to compare results directly with Visweswaran et al [<xref ref-type="bibr" rid="ref3">3</xref>]. Second, while we matched keywords with Visweswaran et al’s study [<xref ref-type="bibr" rid="ref3">3</xref>], due to the evolving nature of language on Twitter, our collection methods could have overlooked new products or trends that have become prevalent on the Twitter platform. Third, we analyzed tweets that were written in English only. This limits the populations from which this classifier can accurately classify tweets. For instance, other countries may have different sentiments toward vaping that were not supported in this study. Finally, the date range of the tweets was limited to a 2-month time span, which limits the generalizability of the classifier over time, and therefore, more analysis would need to be performed to discover the longevity of the classifier.</p>
      </sec>
      <sec>
        <title>Future Research</title>
        <p>Several different research endeavors relating to utilizing pretrained deep learning models to classify tweets could be explored. First, we could expand from analyzing only English tweets to diversify this work for global regions and languages. Additionally, analysis on the number of annotated tweets needed to create an equivalent LSTM model could be performed to give substantial evidence that pretrained models provide benefits beyond just higher classification accuracy. Finally, the BERTweet model developed in this paper could be extended to create a real-time analysis platform for sentiment toward vaping to better inform public health officials, allowing them to understand the impacts of current and future policy interventions.</p>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>In this study, we produced a deep learning classification model based on BERTweet that was able to classify a vaping-related tweet along several viewpoints such as relevance (relevant or not), commercial nature (commercial or not), and sentiment (positive, negative, or neutral). We then compared the classification performance of the BERTweet model with that of an LSTM model for the classification of 2401 hand-coded tweets. We found that in all classification cases BERTweet achieved higher levels of accuracy. The strong performance of BERTweet shows that it can increase the ability to accurately monitor social platforms such as Twitter with regard to public health trends such as vaping.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">API</term>
          <def>
            <p>application programming interface</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AUROC</term>
          <def>
            <p>area under the receiver operating characteristic curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">BERT</term>
          <def>
            <p>bidirectional encoder representations from transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CNN</term>
          <def>
            <p>convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">LSTM</term>
          <def>
            <p>long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">ReLU</term>
          <def>
            <p>rectified linear unit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">RITHM</term>
          <def>
            <p>Real-time Infoveillance of Twitter Health Messages</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">RNN</term>
          <def>
            <p>recurrent neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">RoBERTa</term>
          <def>
            <p>robustly optimized BERT pre-training approach</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors thank Eric Schisler, Caroline Stokenbury, and Emily Abby Norton for data annotation. This work was supported by awards from the National Cancer Institute of the National Institutes of Health (R01-CA225773), the National Library of Medicine of the National Institutes of Health (R01- LM012095), and the National Science Foundation (ACI-1548562 and ACI-1445606 to the Pittsburgh Supercomputing Center). The content is solely the responsibility of the authors and does not necessarily represent the official views of the National Institutes of Health or the National Science Foundation.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ying</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>10 Twitter Statistics Every Marketer Should Know in 2021</article-title>
          <source>Infographic</source>
          <access-date>2021-04-16</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.oberlo.com/blog/twitter-statistics">https://www.oberlo.com/blog/twitter-statistics</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Baker</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Using Large Pre-Trained Language Models to Track Emotions of Cancer Patients on Twitter</article-title>
          <source>Computer Science and Computer Engineering Undergraduate Honors Theses</source>
          <access-date>2022-05-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://scholarworks.uark.edu/csceuht/92/">https://scholarworks.uark.edu/csceuht/92/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Visweswaran</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Colditz</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>O'Halloran</surname>
              <given-names>Patrick</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Taneja</surname>
              <given-names>SB</given-names>
            </name>
            <name name-style="western">
              <surname>Welling</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Sidani</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Primack</surname>
              <given-names>BA</given-names>
            </name>
          </person-group>
          <article-title>Machine learning classifiers for Twitter surveillance of vaping: comparative machine learning study</article-title>
          <source>J Med Internet Res</source>
          <year>2020</year>
          <month>08</month>
          <day>12</day>
          <volume>22</volume>
          <issue>8</issue>
          <fpage>e17478</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2020/8/e17478/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/17478</pub-id>
          <pub-id pub-id-type="medline">32784184</pub-id>
          <pub-id pub-id-type="pii">v22i8e17478</pub-id>
          <pub-id pub-id-type="pmcid">PMC7450367</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>LeCun</surname>
              <given-names>Yann</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Yoshua</given-names>
            </name>
            <name name-style="western">
              <surname>Hinton</surname>
              <given-names>Geoffrey</given-names>
            </name>
          </person-group>
          <article-title>Deep learning</article-title>
          <source>Nature</source>
          <year>2015</year>
          <month>05</month>
          <day>28</day>
          <volume>521</volume>
          <issue>7553</issue>
          <fpage>436</fpage>
          <lpage>44</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://en.wikipedia.org/wiki/Deep_learning"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/nature14539</pub-id>
          <pub-id pub-id-type="medline">26017442</pub-id>
          <pub-id pub-id-type="pii">nature14539</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>Rui</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>Ruqiang</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Jinjiang</given-names>
            </name>
            <name name-style="western">
              <surname>Mao</surname>
              <given-names>Kezhi</given-names>
            </name>
          </person-group>
          <article-title>Learning to monitor machine health with convolutional bi-directional LSTM networks</article-title>
          <source>Sensors</source>
          <year>2017</year>
          <month>01</month>
          <day>30</day>
          <volume>17</volume>
          <issue>2</issue>
          <fpage>273</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=s17020273"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/s17020273</pub-id>
          <pub-id pub-id-type="medline">28146106</pub-id>
          <pub-id pub-id-type="pii">s17020273</pub-id>
          <pub-id pub-id-type="pmcid">PMC5336098</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Colditz</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Chu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Emery</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Larkin</surname>
              <given-names>CR</given-names>
            </name>
            <name name-style="western">
              <surname>James</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Welling</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Primack</surname>
              <given-names>BA</given-names>
            </name>
          </person-group>
          <article-title>Toward real-time infoveillance of Twitter health messages</article-title>
          <source>Am J Public Health</source>
          <year>2018</year>
          <month>08</month>
          <volume>108</volume>
          <issue>8</issue>
          <fpage>1009</fpage>
          <lpage>1014</lpage>
          <pub-id pub-id-type="doi">10.2105/ajph.2018.304497</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: Pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2019</year>
          <conf-name>Association for Computational Linguistics</conf-name>
          <conf-date>2019</conf-date>
          <conf-loc>Minneapolis, Minnesota</conf-loc>
          <fpage>4171</fpage>
          <lpage>4186</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/N19-1423"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/n18-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ott</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Goyal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Joshi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Levy</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zettlemoyer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Stoyanov</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>RoBERTa: A Robustly Optimized BERT Pretraining Approach</article-title>
          <source>ArXiv. abs/1907.11692</source>
          <year>2019</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.1907.11692</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Vu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>BERTweet: A Pre-trained Language Model for English Tweets</article-title>
          <source>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</source>
          <year>2020</year>
          <conf-name>Association for Computational Linguistics</conf-name>
          <conf-date>July 5-10, 2020</conf-date>
          <conf-loc>Virtual</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>9</fpage>
          <lpage>14</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/2020.emnlp-demos.2/"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2020.emnlp-demos.2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gohil</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Vuik</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Darzi</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Sentiment analysis of health care tweets: review of the methods used</article-title>
          <source>JMIR Public Health Surveill</source>
          <year>2018</year>
          <month>04</month>
          <day>23</day>
          <volume>4</volume>
          <issue>2</issue>
          <fpage>e43</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://publichealth.jmir.org/2018/2/e43/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/publichealth.5789</pub-id>
          <pub-id pub-id-type="medline">29685871</pub-id>
          <pub-id pub-id-type="pii">v4i2e43</pub-id>
          <pub-id pub-id-type="pmcid">PMC5938573</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Edara</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Vanukuri</surname>
              <given-names>LP</given-names>
            </name>
            <name name-style="western">
              <surname>Sistla</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Kolli</surname>
              <given-names>VKK</given-names>
            </name>
          </person-group>
          <article-title>Sentiment analysis and text categorization of cancer medical records with LSTM</article-title>
          <source>J Ambient Intell Human Comput</source>
          <year>2019</year>
          <month>7</month>
          <day>16</day>
          <fpage>1</fpage>
          <pub-id pub-id-type="doi">10.1007/s12652-019-01399-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ji</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chun</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Geller</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Twitter sentiment classification for measuring public health concerns</article-title>
          <source>Soc Netw Anal Min</source>
          <year>2015</year>
          <month>5</month>
          <day>12</day>
          <volume>5</volume>
          <issue>1</issue>
          <fpage>13</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/32226558"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s13278-015-0253-5</pub-id>
          <pub-id pub-id-type="medline">32226558</pub-id>
          <pub-id pub-id-type="pii">253</pub-id>
          <pub-id pub-id-type="pmcid">PMC7096866</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Vovsha</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Rambow</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Passonneau</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Sentiment Analysis of Twitter Data</article-title>
          <source>Department of Computer Science Columbia University</source>
          <year>2011</year>
          <conf-name>Proceedings of the Workshop on Language in Social Media</conf-name>
          <conf-date>June 2011</conf-date>
          <conf-loc>New York, NY</conf-loc>
          <fpage>30</fpage>
          <lpage>38</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.cs.columbia.edu/~julia/papers/Agarwaletal11.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Harjule</surname>
              <given-names>Priyanka</given-names>
            </name>
            <name name-style="western">
              <surname>Gurjar</surname>
              <given-names>Astha</given-names>
            </name>
            <name name-style="western">
              <surname>Seth</surname>
              <given-names>Harshita</given-names>
            </name>
            <name name-style="western">
              <surname>Thakur</surname>
              <given-names>Priya</given-names>
            </name>
          </person-group>
          <article-title>Text Classification on Twitter Data</article-title>
          <year>2020</year>
          <conf-name>3rd International Conference on Emerging Technologies in Computer Engineering: Machine Learning and Internet of Things</conf-name>
          <conf-date>February 7-8, 2020</conf-date>
          <conf-loc>Jaipur, India</conf-loc>
          <fpage>160</fpage>
          <lpage>164</lpage>
          <pub-id pub-id-type="doi">10.1109/ICETCE48199.2020.9091774</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kharde</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Sonawane</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Sentiment analysis of Twitter data: a survey of techniques</article-title>
          <source>IJCA</source>
          <year>2016</year>
          <month>04</month>
          <day>15</day>
          <volume>139</volume>
          <issue>11</issue>
          <fpage>5</fpage>
          <lpage>15</lpage>
          <pub-id pub-id-type="doi">10.5120/ijca2016908625</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Colditz</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Welling</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>NA</given-names>
            </name>
            <name name-style="western">
              <surname>James</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Primack</surname>
              <given-names>BA</given-names>
            </name>
          </person-group>
          <article-title>World vaping day: contextualizing vaping culture in online social media using a mixed methods approach</article-title>
          <source>Journal of Mixed Methods Research</source>
          <year>2017</year>
          <month>04</month>
          <day>09</day>
          <volume>13</volume>
          <issue>2</issue>
          <fpage>196</fpage>
          <lpage>215</lpage>
          <pub-id pub-id-type="doi">10.1177/1558689817702753</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sidani</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Colditz</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Barrett</surname>
              <given-names>EL</given-names>
            </name>
            <name name-style="western">
              <surname>Shensa</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>James</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Primack</surname>
              <given-names>BA</given-names>
            </name>
          </person-group>
          <article-title>I wake up and hit the JUUL: Analyzing Twitter for JUUL nicotine effects and dependence</article-title>
          <source>Drug and Alcohol Dependence</source>
          <year>2019</year>
          <month>11</month>
          <volume>204</volume>
          <fpage>107500</fpage>
          <pub-id pub-id-type="doi">10.1016/j.drugalcdep.2019.06.005</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Han</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kavuluru</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Exploratory Analysis of Marketing and Non-Marketing E-cigarette Themes on Twitter</article-title>
          <year>2016</year>
          <month>11</month>
          <day>11</day>
          <conf-name>International Conference on Social Informatics</conf-name>
          <conf-date>2016</conf-date>
          <conf-loc>Bellevue</conf-loc>
          <pub-id pub-id-type="doi">10.1007/978-3-319-47874-6_22</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Myslín</surname>
              <given-names>Mark</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Conway</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Using Twitter to examine smoking behavior and perceptions of emerging tobacco products</article-title>
          <source>J Med Internet Res</source>
          <year>2013</year>
          <month>08</month>
          <day>29</day>
          <volume>15</volume>
          <issue>8</issue>
          <fpage>e174</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2013/8/e174/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.2534</pub-id>
          <pub-id pub-id-type="medline">23989137</pub-id>
          <pub-id pub-id-type="pii">v15i8e174</pub-id>
          <pub-id pub-id-type="pmcid">PMC3758063</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cole-Lewis</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Varghese</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sanders</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schwarz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pugatch</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Augustson</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Assessing electronic cigarette-related tweets for sentiment and content using supervised machine learning</article-title>
          <source>J Med Internet Res</source>
          <year>2015</year>
          <month>08</month>
          <day>25</day>
          <volume>17</volume>
          <issue>8</issue>
          <fpage>e208</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2015/8/e208/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.4392</pub-id>
          <pub-id pub-id-type="medline">26307512</pub-id>
          <pub-id pub-id-type="pii">v17i8e208</pub-id>
          <pub-id pub-id-type="pmcid">PMC4642404</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kornfield</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Szczypka</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Emery</surname>
              <given-names>SL</given-names>
            </name>
          </person-group>
          <article-title>A cross-sectional examination of marketing of electronic cigarettes on Twitter</article-title>
          <source>Tob Control</source>
          <year>2014</year>
          <month>07</month>
          <day>16</day>
          <volume>23 Suppl 3</volume>
          <issue>suppl 3</issue>
          <fpage>iii26</fpage>
          <lpage>30</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://tobaccocontrol.bmj.com/lookup/pmidlookup?view=long&#38;pmid=24935894"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/tobaccocontrol-2014-051551</pub-id>
          <pub-id pub-id-type="medline">24935894</pub-id>
          <pub-id pub-id-type="pii">tobaccocontrol-2014-051551</pub-id>
          <pub-id pub-id-type="pmcid">PMC4078681</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Resende</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Culotta</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>A Demographic and Sentiment Analysis of E-cigarette Messages on Twitter</article-title>
          <source>Computer Science Department, Illinois Institute of Technology</source>
          <year>2015</year>
          <conf-name>6th ACM Conference on Bioinformatics, Computational Biology, and Health Informatics</conf-name>
          <conf-date>September 9-12, 2015</conf-date>
          <conf-loc>Atlanta</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://cs.tulane.edu/~aculotta/pubs/resende15demographic.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Jidong</given-names>
            </name>
            <name name-style="western">
              <surname>Duan</surname>
              <given-names>Zongshuan</given-names>
            </name>
            <name name-style="western">
              <surname>Kwok</surname>
              <given-names>Julian</given-names>
            </name>
            <name name-style="western">
              <surname>Binns</surname>
              <given-names>Steven</given-names>
            </name>
            <name name-style="western">
              <surname>Vera</surname>
              <given-names>Lisa E</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>Yoonsang</given-names>
            </name>
            <name name-style="western">
              <surname>Szczypka</surname>
              <given-names>Glen</given-names>
            </name>
            <name name-style="western">
              <surname>Emery</surname>
              <given-names>Sherry L</given-names>
            </name>
          </person-group>
          <article-title>Vaping versus JUULing: how the extraordinary growth and marketing of JUUL transformed the US retail e-cigarette market</article-title>
          <source>Tob Control</source>
          <year>2019</year>
          <month>03</month>
          <volume>28</volume>
          <issue>2</issue>
          <fpage>146</fpage>
          <lpage>151</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://tobaccocontrol.bmj.com/lookup/pmidlookup?view=long&#38;pmid=29853561"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/tobaccocontrol-2018-054382</pub-id>
          <pub-id pub-id-type="medline">29853561</pub-id>
          <pub-id pub-id-type="pii">tobaccocontrol-2018-054382</pub-id>
          <pub-id pub-id-type="pmcid">PMC6274629</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
