<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v10i12e40102</article-id>
      <article-id pub-id-type="pmid">36534443</article-id>
      <article-id pub-id-type="doi">10.2196/40102</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Comparison of Methods for Estimating Temporal Topic Models From Primary Care Clinical Text Data: Retrospective Closed Cohort Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Mircheva</surname>
            <given-names>Iskra</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>King</surname>
            <given-names>Andrew</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Meaney</surname>
            <given-names>Christopher</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Dalla Lana School of Public Health, Division of Biostatistics</institution>
            <institution>University of Toronto</institution>
            <addr-line>155 College Street</addr-line>
            <addr-line>Toronto, ON, M5G1V7</addr-line>
            <country>Canada</country>
            <phone>1 4169785602</phone>
            <email>christopher.meaney@utoronto.ca</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5429-5233</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Escobar</surname>
            <given-names>Michael</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9055-4709</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Stukel</surname>
            <given-names>Therese A</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9283-8764</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Austin</surname>
            <given-names>Peter C</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3337-233X</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Jaakkimainen</surname>
            <given-names>Liisa</given-names>
          </name>
          <degrees>MSc, MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3203-0007</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Dalla Lana School of Public Health, Division of Biostatistics</institution>
        <institution>University of Toronto</institution>
        <addr-line>Toronto, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Family and Community Medicine</institution>
        <institution>University of Toronto</institution>
        <addr-line>Toronto, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>ICES</institution>
        <addr-line>Toronto, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Institute of Health Policy, Management and Evaluation</institution>
        <institution>University of Toronto</institution>
        <addr-line>Toronto, ON</addr-line>
        <country>Canada</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Christopher Meaney <email>christopher.meaney@utoronto.ca</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>12</month>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>19</day>
        <month>12</month>
        <year>2022</year>
      </pub-date>
      <volume>10</volume>
      <issue>12</issue>
      <elocation-id>e40102</elocation-id>
      <history>
        <date date-type="received">
          <day>6</day>
          <month>6</month>
          <year>2022</year>
        </date>
        <date date-type="rev-request">
          <day>31</day>
          <month>7</month>
          <year>2022</year>
        </date>
        <date date-type="rev-recd">
          <day>1</day>
          <month>9</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>18</day>
          <month>9</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Christopher Meaney, Michael Escobar, Therese A Stukel, Peter C Austin, Liisa Jaakkimainen. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 19.12.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2022/12/e40102" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Health care organizations are collecting increasing volumes of clinical text data. Topic models are a class of unsupervised machine learning algorithms for discovering latent thematic patterns in these large unstructured document collections.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>We aimed to comparatively evaluate several methods for estimating temporal topic models using clinical notes obtained from primary care electronic medical records from Ontario, Canada.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We used a retrospective closed cohort design. The study spanned from January 01, 2011, through December 31, 2015, discretized into 20 quarterly periods. Patients were included in the study if they generated at least 1 primary care clinical note in each of the 20 quarterly periods. These patients represented a unique cohort of individuals engaging in high-frequency use of the primary care system. The following temporal topic modeling algorithms were fitted to the clinical note corpus: nonnegative matrix factorization, latent Dirichlet allocation, the structural topic model, and the BERTopic model.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Temporal topic models consistently identified latent topical patterns in the clinical note corpus. The learned topical bases identified meaningful activities conducted by the primary health care system. Latent topics displaying near-constant temporal dynamics were consistently estimated across models (eg, pain, hypertension, diabetes, sleep, mood, anxiety, and depression). Several topics displayed predictable seasonal patterns over the study period (eg, respiratory disease and influenza immunization programs).</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Nonnegative matrix factorization, latent Dirichlet allocation, structural topic model, and BERTopic are based on different underlying statistical frameworks (eg, linear algebra and optimization, Bayesian graphical models, and neural embeddings), require tuning unique hyperparameters (optimizers, priors, etc), and have distinct computational requirements (data structures, computational hardware, etc). Despite the heterogeneity in statistical methodology, the learned latent topical summarizations and their temporal evolution over the study period were consistently estimated. Temporal topic models represent an interesting class of models for characterizing and monitoring the primary health care system.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>clinical text data</kwd>
        <kwd>temporal topic model</kwd>
        <kwd>nonnegative matrix factorization</kwd>
        <kwd>latent Dirichlet allocation</kwd>
        <kwd>structural topic model</kwd>
        <kwd>BERTopic</kwd>
        <kwd>text mining</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Primary Care Text Data</title>
        <p>Electronic medical record (EMR) systems are increasingly being adopted in clinical settings across the globe [<xref ref-type="bibr" rid="ref1">1</xref>]. As a result, health care organizations are generating, collecting, and digitally storing large volumes of routinely collected clinical information. In this study, we focused on clinical text data commonly collected in primary care EMR systems. We compared a class of unsupervised machine learning models—temporal topic models—used to characterize the latent thematic content of large document corpora and summarize latent topical dynamics over time. Temporal topic models have the potential to be applied to large unstructured clinical document collections, routinely captured in modern EMR systems, to passively characterize the primary health care system.</p>
      </sec>
      <sec>
        <title>Topic Models</title>
        <p>Several methods can be used to estimate a topic model, given a document collection, and to characterize the evolution of latent topical bases over time. Latent Dirichlet allocation (LDA) [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>] uses a Bayesian probabilistic graphical modeling framework to define a topic model. Learned topical vectors describe the affinity of a word (v=1...V) in the corpus for a particular topic (k=1...K). A latent admixing vector describes the affinity of a specific document (d=1...D) for a specific topic (k=1...K). The latent matrices in the LDA model are learned from document-word co-occurrence statistics empirically collected from the clinical note corpus. The traditional LDA model is not intended for modeling temporal document collections; however, Griffiths et al [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>] demonstrated how simple time-stratified estimators can be used to illustrate the evolution of latent topical vectors over time. The structural topic model (STM) extends the classical LDA model, allowing either (1) the matrix of per-document topical prevalence weights or (2) the matrix of per-topic word probabilities to deterministically vary according to covariate information parameterized using a generalized linear model [<xref ref-type="bibr" rid="ref6">6</xref>]. Several parameterizations of time can be incorporated into the generalized linear model (eg, discrete, continuous, or spline effects), allowing the STM to flexibly model the evolution of topical prevalence vectors over time. Nonnegative matrix factorization (NMF) [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>] uses a linear algebraic framework and principles from constrained optimization for topic modeling. 
NMF directly estimates the parameter matrices of a topic model by factorizing an observed document term matrix (DTM) into 2 latent nonnegative matrices. One of the latent parameter matrices describes the affinity of a document (d=1...D) to a topic (k=1...K), and the other latent matrix describes the affinity of a word (v=1...V) to a topic (k=1...K). Post hoc multivariate transformations of the NMF latent parameter matrices can be used to generate estimates of topical evolution over time. Recently, neural frameworks have been developed for topic modeling, such as top2vec [<xref ref-type="bibr" rid="ref10">10</xref>] and BERTopic [<xref ref-type="bibr" rid="ref11">11</xref>]. The BERTopic neural topic models begin by embedding documents into a latent vector space. A finite number of clusters (k=1...K) of semantically similar documents are identified in the embedding space. For each document cluster (k), the most relevant words describing the cluster or topic are extracted using a cluster-specific term-frequency inverse-document frequency (TF-IDF) weighting technique [<xref ref-type="bibr" rid="ref11">11</xref>].</p>
      </sec>
      <sec>
        <title>Study Objectives</title>
        <p>The objective of this study was to compare the performance of several temporal topic modeling methodologies fitted to a corpus of primary care clinical notes. We compared the following temporal topic modeling methodologies: NMF, LDA, STM, and BERTopic. We examined (1) the overall matrix of per-topic word probabilities estimated over the corpus and (2) the multivariate time series structures describing the evolution of latent topical prevalence weights (k=1...K) over discrete times (t=1...T). We compared the methods using a data set of longitudinal primary care clinical notes collected over 5 years (2011-2015) in Ontario, Canada.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Mathematically Representing and Computationally Processing Our Clinical Text Corpus</title>
        <p>Topic models use statistical information regarding document-word co-occurrence frequencies to learn meaningful latent variable representations from a corpus. Each document in the collection (d=1...D) is represented as a high-dimensional length-V vector (v=1...V), where each element is a count of the number of times a particular word or token (v) in an empirical vocabulary is observed in a particular document (d). We represented the collection of document-specific term-frequency vectors into a matrix X of dimension D*V, called the DTM. The DTM is a large, sparse matrix. However, the matrix is overdetermined because many of the rows (representing document-specific term-frequency vectors) and columns (representing word or token occurrence frequency over all documents in the corpus) demonstrate strong intercorrelations. Dimension-reduction techniques, such as topic models, use intercorrelated statistical semantic information to estimate meaningful thematic representations from document collections. Topic models learn (1) clusters of intercorrelated words describing the topical content of the corpus and (2) clusters of correlated documents sharing latent topical concepts.</p>
        <p>The most challenging and subjective aspect associated with construction of the DTM involves specification of the vocabulary or dictionary (v=1...V) encoding the column space of the matrix. A priori constructed lexicons or dictionaries (of dimension V) can be used to determine the study vocabulary. Specification of appropriate domain-specific dictionaries would be delegated to subject matter experts on the research team. Alternatively, an entirely computational approach could specify a text tokenization or normalization pipeline and computationally parse the input character sequences into a finite number of tokens.</p>
        <p>In this study, we adopted a hybrid approach to vocabulary or dictionary specification. We began by tokenizing the clinical notes on whitespace boundaries (spaces, tabs, newlines, carriage returns, etc). We normalized tokens using lower-case conversion and removed all nonalphabetic characters. We removed tokens with a character length ≤1. Finally, we sorted the list of tokens or words by decreasing occurrence frequency and manually reviewed the sorted list of tokens. Our manual review identified V=2930 distinct tokens for inclusion in our final vocabulary. The total number of tokens in the corpus was 3,003,583. The tokens chosen for inclusion in our final dictionary or vocabulary were mainly medical terms with precise semantic meanings (disease names, disease symptoms, drug names, medical procedures, medical specialties, anatomical locations, etc). We excluded stop words or tokens (ie, syntactic or functional tokens with little clinical semantic meaning). Words with low occurrence frequency were excluded for computational considerations. All text processing was conducted using R (R Foundation for Statistical Computing; version 3.6).</p>
      </sec>
      <sec>
        <title>Review of Methods for Temporal Topic Modeling</title>
        <sec>
          <title>NMF Model</title>
          <p>NMF estimates latent topical matrices using the document-word co-occurrence statistics contained in the empirical DTM. NMF factorizes the D*V dimensional DTM into 2 latent submatrices of dimensions D*K (θ) and K*V (Φ). The DTM (X) consists of nonnegative integers (ie, word frequency counts), whereas the learned matrices (θ,Φ) consist of nonnegative real values. Mathematically, the NMF objective involves learning optimal values of the latent matrices (θ,Φ) that best approximate the input data set (X ≈ θΦ), subject to the constraint that the learned matrices contain nonnegative values.</p>
          <disp-formula>
            <graphic xlink:href="medinform_v10i12e40102_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>We selected a least square loss function to train the NMF model. The objective function specifies that the observed data elements are approximated in a K-dimensional bilinear form <inline-graphic xlink:href="medinform_v10i12e40102_fig11.png" xlink:type="simple" mimetype="image"/>. The analyst must specify the dimensions of the latent space: K (the number of topics). Seminal articles on NMF include Paatero and Tapper [<xref ref-type="bibr" rid="ref7">7</xref>] and Lee and Seung [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. Surveys of NMF and low-rank models are provided by Berry et al [<xref ref-type="bibr" rid="ref12">12</xref>] and Udell et al [<xref ref-type="bibr" rid="ref13">13</xref>].</p>
          <disp-formula>
            <graphic xlink:href="medinform_v10i12e40102_fig12.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>Post hoc, the row vectors constituting both θ and Φ can be normalized by dividing by their respective row sums. The resulting normalized vectors can be interpreted as compositional or probability vectors (ie, each normalized row of θ and Φ contains nonnegative entries that sum to 1, row-wise). The row vectors of the matrix Φ encode a set of k=1...K per-topic word probabilities or proportions (estimated over a discrete set of v=1...V words in the empirical corpus vocabulary). The row vectors of the matrix θ encode a set of d=1...D per-document topic proportions (estimated over a discrete set of k=1...K latent dimensions), encoding the affinity a given document has for a particular topic.</p>
          <p>For each document d=1...D, assume we observe a time stamp that allows us to associate each document (and latent embedding) with a T-dimensional indicator variable denoting the observation time (t=1...T). We estimated a K-dimensional multivariate mean topical prevalence vector for each design point, t=1...T. This resulted in a multivariate time series structure (a T*K dimensional matrix). Each column (k=1...K) of the matrix is a length T time series that described the evolution of a latent topical vector.</p>
          <p>The sklearn.decomposition.NMF() function in the Python SKLearn package (version 0.24.2) was used to fit the NMF topic model.</p>
        </sec>
        <sec>
          <title>LDA Model</title>
          <p>LDA is a probabilistic topic model. Probabilistic topic models assume that a document comprises a mixture of topics. These (latent) topics represent a probability distribution over a finite vocabulary of words or tokens. Topic models can also be described as admixture models. Each document is a soft mixture of topics (k=1...K), where a topic is itself a probability distribution over words in the vocabulary (v=1...V). A graphical model describing LDA is shown in <xref rid="figure1" ref-type="fig">Figure 1</xref> [<xref ref-type="bibr" rid="ref2">2</xref>].</p>
          <p>The LDA graphical model also describes a generative process for creating a single document in the corpus. This can be succinctly described using the following sampling notation [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>].</p>
          <p>To generate a document, we begin by sampling the per-topic word distributions from a Dirichlet distribution parameterized by a V dimensional prior concentration parameter (β). Topical vectors (k=1...K) are shared over the collection of documents.</p>
          <disp-formula>
            <graphic xlink:href="medinform_v10i12e40102_fig13.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>Next, for each document d=1...D in the collection, we sample the per-document topic distribution from a Dirichlet distribution parameterized according to a K-dimensional prior concentration parameter (α).</p>
          <disp-formula>
            <graphic xlink:href="medinform_v10i12e40102_fig14.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>For each word in each document, we sample a topical indicator variable, z<sub>d,n</sub>. This variable takes an integer value between 1 and K and signifies the per-topic word distribution from which a specific word, w<sub>d,n</sub>, is chosen. The index n denotes the n<sup>th</sup> word in a variable length document (n=1...N<sub>d</sub>).</p>
          <disp-formula>
            <graphic xlink:href="medinform_v10i12e40102_fig15.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>Finally, we draw a single word token, w<sub>d,n</sub>, from the topical distribution associated with z<sub>d,n</sub>. The word indicator is an element v=1...V in our empirical dictionary or vocabulary.</p>
          <disp-formula>
            <graphic xlink:href="medinform_v10i12e40102_fig16.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>The statistical inference problem associated with probabilistic topic modeling involves inverting the sampling process and learning model-defined latent parameters given the observed text data. The latent variables indicate which words are assigned to which topical indicators (z), which documents have an affinity for which topics (θ), and which words co-occur with high likelihood under which topics (Φ). The latent parameters associated with an LDA topic model are typically estimated using Bayesian statistical machinery (Gibbs sampling [<xref ref-type="bibr" rid="ref14">14</xref>], variational inference [<xref ref-type="bibr" rid="ref2">2</xref>], and other methods).</p>
          <p>A multivariate transformation of the matrix of per-document topical prevalence weights generates a multivariate time series data structure. This object is of dimension T*K, where each column k=1...K represents a univariate topical time series of length T. This series describes the evolution of latent topical vectors over our study period.</p>
          <p>The sklearn.decomposition.LatentDirichletAllocation() function in Python SKLearn (version 0.24.2) was used to fit the LDA topic model.</p>
          <fig id="figure1" position="float">
            <label>Figure 1</label>
            <caption>
              <p>Graphical model representation of the latent Dirichlet allocation topic model.</p>
            </caption>
            <graphic xlink:href="medinform_v10i12e40102_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>STM Model</title>
          <p>The STM is another type of probabilistic topic model. The STM extends the LDA topic model, allowing latent matrices of (1) per-document topical prevalence weights or (2) per-topic word proportions to vary according to a generalized linear model parameterization [<xref ref-type="bibr" rid="ref6">6</xref>]. Covariate effects on the latent matrix of per-document topical prevalence weights are incorporated into the model using a logistic-normal prior distribution over per-document topical prevalence vectors, similar to the correlated topic model [<xref ref-type="bibr" rid="ref16">16</xref>]. Covariate effects on the latent matrix of per-topic word proportions are incorporated into the model using a type of multinomial logit prior. In this study, we modeled covariate effects (in our study, discrete time effects, t=1...T) on the matrix of per-document topic prevalence weights. We did not assume that the matrix of per-topic word proportions varied according to covariates. The plate notation of STM is shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>. Variational methods are used for posterior inference in STM [<xref ref-type="bibr" rid="ref6">6</xref>].</p>
          <p>To generate a document under STM, we begin by sampling the per-topic word distributions from an (intercept-only) multinomial logit model (where multinomial logit regression parameters are given sparse “gamma-lasso” prior) [<xref ref-type="bibr" rid="ref6">6</xref>].</p>
          <disp-formula>
            <graphic xlink:href="medinform_v10i12e40102_fig17.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>Next, we sample the per-document topic distribution from a logistic-normal distribution parameterized in terms of a mean vector and covariance matrix. A D*T dimensional design matrix encodes the time point (t=1...T) under which the document (d=1...D) was observed. The coefficient matrix γ is of dimension T*K and encodes discrete time effects on each of the per-document topical prevalence weights (a length K vector for each document d=1...D). Finally, Σ is a K*K dimensional covariance matrix that encodes correlations between topical prevalence vectors (parameterized under a logistic-normal model).</p>
          <disp-formula>
            <graphic xlink:href="medinform_v10i12e40102_fig18.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>For each word (n=1...N<sub>d</sub>) in each document (d=1...D), we sample a topical indicator variable z<sub>d,n</sub>. This variable takes an integer value between 1 and K and signifies the per-topic word distribution from which a specific word, w<sub>d,n</sub>, is chosen. It must be noted that the upper limit N<sub>d</sub> suggests that the number of words used for any given document (d) can vary.</p>
          <disp-formula>
            <graphic xlink:href="medinform_v10i12e40102_fig19.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>Finally, we draw a single word or token, w<sub>d,n</sub>, from the topical distribution associated with z<sub>d,n</sub>. The word indicator is an element v=1...V in our empirical dictionary or vocabulary.</p>
          <disp-formula>
            <graphic xlink:href="medinform_v10i12e40102_fig20.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>The framework for STM naturally allows for the estimation of temporal effects on topical prevalence weights. In our study, discrete time effects on topical prevalence can be interpreted using the coefficient matrix (γ) from the fitted logistic-normal model. As the temporal effects are encoded in a Bayesian regression modeling framework, we can also compute inferential measures (posterior means, highest posterior density intervals, etc). The single-stage inferential mechanism encoded in STM is a clear strength over earlier NMF and LDA models.</p>
          <p>We used the stm() function in the STM package in R to fit the STM to our study data.</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Graphical model representation of the structural topic model.</p>
            </caption>
            <graphic xlink:href="medinform_v10i12e40102_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Neural Topic Modeling via BERTopic</title>
          <p>Recently, researchers have developed topic models that integrate neural architectures and related techniques for model specification and learning. These neural topic models represent a different class of topic models compared with those introduced previously. Examples of recently developed neural topic models include top2vec [<xref ref-type="bibr" rid="ref10">10</xref>] and BERTopic [<xref ref-type="bibr" rid="ref11">11</xref>]. In this study, we focused on the BERTopic model.</p>
          <p>BERTopic begins with embedding documents empirically observed in the study corpus into a latent embedding space. Many methods exist for embedding discrete linguistic units (words, sentences, paragraphs, documents, etc) into an embedding space. For example, words can be embedded in a vector space using word2vec [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>], GloVe [<xref ref-type="bibr" rid="ref20">20</xref>], FastText [<xref ref-type="bibr" rid="ref21">21</xref>], ELMO [<xref ref-type="bibr" rid="ref22">22</xref>], Flair [<xref ref-type="bibr" rid="ref23">23</xref>], and transformer models [<xref ref-type="bibr" rid="ref24">24</xref>]. Sentences and documents can be embedded using methods such as doc2vec [<xref ref-type="bibr" rid="ref25">25</xref>], universal sentence encoders [<xref ref-type="bibr" rid="ref26">26</xref>], and transformers [<xref ref-type="bibr" rid="ref24">24</xref>]. The BERTopic model used in this study relies on sentence transformers [<xref ref-type="bibr" rid="ref27">27</xref>], particularly the MPNet sentence transformer model [<xref ref-type="bibr" rid="ref28">28</xref>]. The neural embedding model is a discrete “hyperparameter” in the BERTopic modeling pipeline. Different choices of neural embedding models are associated with their own model-specific hyperparameters (embedding dimension, context window width, model training or optimization arguments, etc).</p>
          <p>Each document (d=1...D) is embedded in a vector space, typically of a few hundred dimensions. The uniform manifold approximation and projection (UMAP) algorithm [<xref ref-type="bibr" rid="ref29">29</xref>] was used as a further nonlinear dimension-reduction technique to assist in the visualization and clustering of document vectors. Clustering was accomplished in the UMAP-reduced space using the hierarchical density-based spatial clustering algorithm of applications with noise (HDBSCAN) [<xref ref-type="bibr" rid="ref30">30</xref>].</p>
          <p>Clusters (k=1...K) of semantically related documents were identified. Scores over words v=1...V in the vocabulary were computed using cluster-specific TF-IDF weights. If a cluster consisted of semantically focused documents, and hence words, we expected to observe coherent and meaningful words identified via TF-IDF scoring. The proportion of documents assigned to each cluster during a specific period (t=1...T) can be used to generate a T*K dimensional multivariate time series structure, depicting the evolution of latent topics over our study period.</p>
          <p>We fitted the BERTopic model using default hyperparameter settings. The BERTopic pipeline requires (1) specification of a document embedding algorithm (in our case, the MPNet sentence transformer model [<xref ref-type="bibr" rid="ref28">28</xref>]), (2) the UMAP nonlinear dimension-reduction algorithm, (3) the HDBSCAN algorithm for cluster identification, and (4) cluster-specific TF-IDF scoring. The individual components of the pipeline could involve substantive hyperparameter optimization. In this study, we used the default model hyperparameter settings.</p>
          <p>We used the Python package bertopic to fit BERTopic models.</p>
        </sec>
      </sec>
      <sec>
        <title>Statistical Methods for Corpus Description and Evaluation of Learned Temporal Topic Models</title>
        <p>We used simple counts and percentages to describe the characteristics of our study sample. We described the number of unique patients and number of unique clinical notes. Each patient in our sample was a “high-user” of the primary care system, in the sense that they generated at least one encounter/note for each of the twenty quarterly time periods between 2011 and 2015. We described the distribution of the number of notes per patient. We described demographic characteristics of the sample (age/sex distributions).</p>
        <p>When fitting the NMF, LDA, and STM models, we constructed a DTM whose row dimension corresponded to the number of unique patients in the sample (ie, 1727 unique patients) multiplied by the number of distinct time periods (<italic>t</italic>=20; 1727×20=34,540). Each term-frequency vector observed in the DTM was length V (V=2930), and an individual element counted the number of times a given word was observed for a given patient in each quarterly period. Across the DTM, we counted the total number of words and the number of unique words. We described the counts and percentages of the top 25 most prevalent words in our clinical note corpus. We also described the sparsity of the DTM.</p>
        <p>For each of the NMF, LDA, STM, and BERTopic models, we constructed a K*T dimensional multivariate time series matrix (this is the transpose of the T*K data structure described earlier). Each row corresponds to a latent topic vector and each column corresponds to a specific quarterly time period. A row vector is a length T time series describing the evolution of a latent topical vector across the study periods. Each column corresponds to a distribution over topics at a particular period (ie, described which topics are most important at a given period). For each row k=1...K, we report the top 5 words loading most strongly on a given topic. The cluster of words was semantically correlated and described the essence of the latent topical vector. A heatmap was used to visualize this high-dimensional multivariate time series structure; and we hierarchically clustered the rows of the matrix using a Euclidean distance metric and Ward agglomeration method (a dendrogram was used to visualize the cluster structure of the topical series).</p>
        <p>The topical structure of each of the NMF, LDA, STM, and BERTopic model fits was described in terms of the top 5 words loading most strongly on each of the k=1...K latent topics. In other words, the topical structure of each model can be described in terms of a “bag” of 250 words or tokens. We investigated the topical diversity of the model fits. Topical diversity was calculated in terms of the number of unique words in the bag of 250 total words. Furthermore, we investigated the top 5 most frequently occurring words in the “bag” describing each model fit. The redundantly occurring words in the topical summaries provided a rough approximation of the semantic concepts that the models repeatedly identified as important.</p>
        <p>We investigated several measures of topical coherence for the NMF, LDA, STM, and BERTopic models. We considered the “UMASS,” “UCI,” and normalized pointwise mutual information (“NPMI”) metrics described in the surveys of Roder et al [<xref ref-type="bibr" rid="ref31">31</xref>] and Rosner et al [<xref ref-type="bibr" rid="ref32">32</xref>]. These metrics assessed the internal consistency of the collection of word clusters describing the topical structure of the NMF, LDA, STM, and BERTopic models. The theoretical minima or maxima of each coherence measure vary; however, larger values indicate models that generated more coherent topical characterizations. Mathematical details related to the calculation of the aforementioned topical coherence metrics are provided later and further outlined in the studies by Roder et al [<xref ref-type="bibr" rid="ref31">31</xref>] and Rosner et al [<xref ref-type="bibr" rid="ref32">32</xref>]. In all the equations used, we assumed that a topical vector is described in terms of its top-L most probable words or tokens; {<italic>w<sub>i</sub>,w<sub>j</sub></italic>} represented distinct words from the top-L set; ε is a small positive constant to avoid potential numerical issues in computation; and δ is a weighting term (used in the normalized NPMI estimates, compared with the unnormalized pointwise mutual information estimates used in the UCI coherence measure).</p>
        <disp-formula>
          <graphic xlink:href="medinform_v10i12e40102_fig23.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <disp-formula>
          <graphic xlink:href="medinform_v10i12e40102_fig24.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <disp-formula>
          <graphic xlink:href="medinform_v10i12e40102_fig21.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>We used a set-based measure of concordance, the Jaccard coefficient, to assess similarities or differences in the topical structure describing the NMF, LDA, STM, and BERTopic models. Each model was described in terms of a “bag” of 250 words or tokens (ie, k=50 topics, described in terms of their top 5 most probable words); consider 2 models generating bags of words or tokens, b<sub>0</sub> and b<sub>1</sub>. The Jaccard coefficient is defined as the cardinality of the intersection of b<sub>0</sub> and b<sub>1</sub> divided by the cardinality of the union of b<sub>0</sub> and b<sub>1</sub>. In mathematical notation, the Jaccard coefficient is expressed as follows:</p>
        <disp-formula>
          <graphic xlink:href="medinform_v10i12e40102_fig22.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>Finally, we described the wall time (in seconds or minutes) required to fit each of the NMF, LDA, STM, and BERTopic models. We also discussed the computational issues associated with hyperparameter tuning of each of the models.</p>
      </sec>
      <sec>
        <title>Study Design, Setting, Data Sources, and Inclusion or Exclusion Criteria</title>
        <p>This study used a retrospective closed cohort design. Clinical notes were obtained from primary care EMR systems geographically distributed across Ontario, Canada. We included all clinical notes written by the patient’s primary care provider between January 01, 2011, and December 31, 2015. We discretized time into quarterly strata (January-March; April-June; July-September; and October-December). Patients were excluded if they did not have at least one clinical note in each of the 20 quarterly strata over the study period. Hence, the selected sample of patients reflects a unique set of individuals who frequently engaged with the primary health care system.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Description of Corpus and Study Sample</title>
        <p>Our document collection contained 160,478 clinical notes from 1727 patients. The 1727 patients received primary care services from 1066 unique primary care physicians at 40 unique primary care clinics (geographically distributed across Ontario, Canada). The median age of the patients was 68 (IQR 55-80) years and ranged from 20 to 103 years (age statistics were calculated using study baseline as a reference date, January 1, 2011). Female patients were observed more frequently than male patients (1157/1727, 67% vs 570/1727, 33%). <xref ref-type="table" rid="table1">Table 1</xref> describes the characteristics of the study sample (in terms of both note-level and patient-level units of analysis).</p>
        <p>The initial note-level DTM had dimensions of 160,478 rows (one row for each clinical note in the corpus) by 2930 columns (one column for each unique word or token in the corpus). The corpus comprised 3,003,583 tokens. The DTM was &#62;99% sparse (ie, it contained almost all zero elements). We also constructed a patient-quarter–level DTM by aggregating notes observed on the same patient within a quarter. This DTM had dimensions of 1727×20=34,540 rows by 2930 columns and was &#62;98% sparse. The top 25 most frequently occurring words in the analytic corpus are listed in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Descriptive statistics for study sample, at note-level and patient-level unit of analysis.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="190"/>
            <col width="390"/>
            <col width="390"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Characteristic</td>
                <td>Unique notes (n=160,478), n (%)</td>
                <td>Unique patients (n=1727), n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="4">
                  <bold>Age (years)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>20-40</td>
                <td>9713 (6.1)</td>
                <td>107 (6.1)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>40-65</td>
                <td>63,588 (39.6)</td>
                <td>675 (39.1)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>65-85</td>
                <td>63,839 (39.8)</td>
                <td>704 (40.8)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>&#62;85</td>
                <td>23,338 (14.5)</td>
                <td>241 (14)</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>Sex</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Male</td>
                <td>51,530 (32.1)</td>
                <td>570 (33)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Female</td>
                <td>108,948 (67.9)</td>
                <td>1157 (67)</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>Year</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>2011</td>
                <td>28,012 (17.5)</td>
                <td>—<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>2012</td>
                <td>31,220 (19.5)</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>2013</td>
                <td>33,676 (21)</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>2014</td>
                <td>33,756 (21)</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>2015</td>
                <td>33,814 (21)</td>
                <td>—</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Top 25 most frequently occurring tokens or words in the final analytic primary care clinical note corpus (N=3,003,583).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="350"/>
            <col width="650"/>
            <thead>
              <tr valign="top">
                <td>Token or word</td>
                <td>Occurrence frequency, n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>pain</td>
                <td>88,132 (2.93)</td>
              </tr>
              <tr valign="top">
                <td>mg</td>
                <td>65,612 (2.18)</td>
              </tr>
              <tr valign="top">
                <td>inr</td>
                <td>52,970 (1.76)</td>
              </tr>
              <tr valign="top">
                <td>bp</td>
                <td>50,751 (1.69)</td>
              </tr>
              <tr valign="top">
                <td>back</td>
                <td>43,556 (1.45)</td>
              </tr>
              <tr valign="top">
                <td>dose</td>
                <td>29,861 (0.99)</td>
              </tr>
              <tr valign="top">
                <td>feels</td>
                <td>24,736 (0.82)</td>
              </tr>
              <tr valign="top">
                <td>rx</td>
                <td>23,211 (0.77)</td>
              </tr>
              <tr valign="top">
                <td>chest</td>
                <td>22,256 (0.74)</td>
              </tr>
              <tr valign="top">
                <td>meds</td>
                <td>20,914 (0.7)</td>
              </tr>
              <tr valign="top">
                <td>referral</td>
                <td>19,409 (0.65)</td>
              </tr>
              <tr valign="top">
                <td>work</td>
                <td>19,398 (0.65)</td>
              </tr>
              <tr valign="top">
                <td>wt</td>
                <td>19,322 (0.64)</td>
              </tr>
              <tr valign="top">
                <td>feeling</td>
                <td>17,415 (0.58)</td>
              </tr>
              <tr valign="top">
                <td>blood</td>
                <td>16,121 (0.54)</td>
              </tr>
              <tr valign="top">
                <td>symptoms</td>
                <td>15,905 (0.53)</td>
              </tr>
              <tr valign="top">
                <td>prn</td>
                <td>15,706 (0.52)</td>
              </tr>
              <tr valign="top">
                <td>urine</td>
                <td>14,633 (0.49)</td>
              </tr>
              <tr valign="top">
                <td>bw</td>
                <td>13,779 (0.46)</td>
              </tr>
              <tr valign="top">
                <td>lab</td>
                <td>13,543 (0.45)</td>
              </tr>
              <tr valign="top">
                <td>clear</td>
                <td>13,271 (0.44)</td>
              </tr>
              <tr valign="top">
                <td>knee</td>
                <td>12,677 (0.42)</td>
              </tr>
              <tr valign="top">
                <td>pharmacy</td>
                <td>12,503 (0.42)</td>
              </tr>
              <tr valign="top">
                <td>sleep</td>
                <td>12,331 (0.41)</td>
              </tr>
              <tr valign="top">
                <td>prescription</td>
                <td>11,945 (0.4)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Comparing Temporal Topic Models Estimated With NMF, LDA, STM, and BERTopic Models</title>
        <p>We comparatively evaluated inferences obtained from fitting the NMF, LDA, STM, and BERTopic models to our primary care clinical note corpus. For each model, we varied the number of topics (K={25,40,45,50,55,60,75}) and observed similar inferences at various levels of the model complexity parameter (K). When K was too small, distinct semantic topics tended to be grouped together, whereas when K was too large, semantically similar topics tended to be split into arbitrary clusters (resulting in an overclustering effect). Using human judgment evaluation, we determined that a model complexity of K=50 topics balanced a parsimonious, while simultaneously expressive, characterization of the clinical document corpus. For each of the NMF, LDA, STM, and BERTopic models, we reported the results assuming K=50 latent topics.</p>
        <p>A summary of the distribution of words over the k=1...50 latent topics (for each of the 4 models under comparison) is given in <xref rid="figure3" ref-type="fig">Figures 3</xref>-<xref rid="figure6" ref-type="fig">6</xref>, respectively. The y-axis in each figure lists the top 5 words loading most strongly on a given topic. For NMF, LDA, and STM, we reported topical prevalence weights associated with each word or token (which is approximately the probability of observing the word or token under a given latent topic). For the BERTopic model, we reported normalized cluster-specific TF-IDF scores associated with words under topics (which can be interpreted similarly to the outputs of the NMF, LDA, and STM models). The x-axis of these plots represents t=1...20 quarterly periods. A column in the plot represents a topical prevalence distribution over latent topics at a given time point. A row in the plot illustrates the evolution of a latent topic over the study period.</p>
        <p>Each of the 4 latent temporal topic models learned a meaningful representation of the primary care clinical notes corpus. In the following paragraphs, we discuss (1) topics consistently estimated across models that demonstrated constant trends in topical prevalence across quarterly periods and (2) topics consistently estimated across quarterly periods that demonstrated interesting seasonal patterns.</p>
        <p>Each of the fitted models consistently identified the following latent primary care topical constructs (and these topics show constant patterns across quarterly periods): sleep (NMF=Topic−45; LDA=Topic-2 or Topic-31; STM=Topic-11; BERTopic=not applicable); mental health, for example, mood, anxiety, and depression, (NMF=Topic-33; LDA=Topic-22; STM=Topic-19; BERTopic=Topic-16); pain (NMF=Topic-1; LDA=Topic-39, Topic-36, Topic-14, Topic-49, Topic-34, or Topic-37; STM=Topic-8; BERTopic=Topic-9 or Topic-39); blood pressure control and monitoring (NMF=Topic-36; LDA=Topic-9; STM=Topic-21; BERTopic=Topic-31); respiratory disease, for example, cough, throat, chest, fever, etc (NMF=Topic-46; LDA=Topic-13; STM=Topic-46; BERTopic=Topic-1), smoking (NMF=Topic-31; LDA=Topic-32; STM=Topic-44; BERTopic=Topic-38); diabetes, for example, blood, sugar, insulin, fbs, etc (NMF=Topic-5; LDA=Topic-43; STM=Topic-42; BERTopic=Topic-8); pharmaceutical prescription management (NMF=Topic-26; LDA=Topic-40; STM=Topic-9; BERTopic=Topic-36 or Topic-5); and annual influenza vaccination programs (NMF=Topic-6; LDA=Topic-29; STM=Topic-36; BERTopic=Topic-50). These thematic areas represented archetypical patients, conditions, or roles encountered in the primary health care system. The consistent extraction of latent themes (represented as semantically correlated word clusters) suggests that each model can leverage information regarding word-context co-occurrence to learn meaningful patterns from a large unstructured clinical document corpus.</p>
        <p><xref rid="figure3" ref-type="fig">Figures 3</xref>-<xref rid="figure6" ref-type="fig">6</xref> illustrate 4 different temporal topic model multivariate time series structures. For a given plot, the x-axis represents time (t=1...20 quarterly periods from 2011-2015), and the y-axis represents a topical vector (k=1...50). The intensity of color in the cell (t,k) indicates the extent to which an encounter at time (t) is related to a latent topic (k). Topical labels are exchangeable and clustered along the y-axis, according to the similarity of the topical time series (a dendrogram describing the similarity or differences across topical clusters is illustrated in <xref rid="figure7" ref-type="fig">Figure 7</xref>). <xref rid="figure3" ref-type="fig">Figures 3</xref>-<xref rid="figure6" ref-type="fig">6</xref> represent different multivariate time series structures estimated with NMF (<xref rid="figure3" ref-type="fig">Figure 3</xref>), LDA (<xref rid="figure4" ref-type="fig">Figure 4</xref>), STM (<xref rid="figure5" ref-type="fig">Figure 5</xref>), and BERTopic (<xref rid="figure6" ref-type="fig">Figure 6</xref>).</p>
        <p>For certain learned topics, seasonal harmonic patterns were stably estimated over the study period. For example, the annual influenza vaccination program consistently occurred in the fall or winter months of the study (NMF=Topic-6; LDA=Topic-29; STM=Topic-36; BERTopic=Topic-50). Similarly, annual spikes in respiratory diseases (cough, cold, influenza, etc) were identified as achieving peaks in the winter months and lows in the summer months (NMF=Topic-46; LDA=Topic-13; STM=Topic-46; BERTopic=Topic-1). These findings are illustrated in <xref rid="figure3" ref-type="fig">Figures 3</xref>-<xref rid="figure6" ref-type="fig">6</xref>; however, we also present individual time series plots of these topics in <xref rid="figure8" ref-type="fig">Figures 8</xref> and <xref rid="figure9" ref-type="fig">9</xref>, so the reader can better appreciate the ability of the different temporal topical models to extract consistent seasonal patterns from the primary care clinical document corpus. Findings regarding consistent seasonal variation in primary care roles over time have strong face validity and are corroborated by complementary data sources (eg, administrative data). Furthermore, the consistency by which these patterns are extracted from our large clinical document collection helps build trust in the opportunity to use word-context co-occurrence statistics (and topic models) to characterize and monitor primary care practices and systems.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>A heat map of the multivariate time series structure associated with the nonnegative matrix factorization temporal topic model.</p>
          </caption>
          <graphic xlink:href="medinform_v10i12e40102_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>A heat map of the multivariate time series structure associated with the latent Dirichlet allocation temporal topic model.</p>
          </caption>
          <graphic xlink:href="medinform_v10i12e40102_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>A heat map of the multivariate time series structure associated with the structural topic model temporal topic model.</p>
          </caption>
          <graphic xlink:href="medinform_v10i12e40102_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>A heat map of the multivariate time series structure associated with the BERTopic temporal topic model.</p>
          </caption>
          <graphic xlink:href="medinform_v10i12e40102_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure7" position="float">
          <label>Figure 7</label>
          <caption>
            <p>Dendrograms displaying the clustering structure of the latent multivariate time series objects learned from nonnegative matrix factorization model (A), latent Dirichlet allocation model (B), structural topic model (C) and BERTopic model (D).</p>
          </caption>
          <graphic xlink:href="medinform_v10i12e40102_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure8" position="float">
          <label>Figure 8</label>
          <caption>
            <p>Descriptive time series plots characterizing the seasonal evolution of annual influenza program topic, as estimated by nonnegative matrix factorization model (A), latent Dirichlet allocation model (B), structural topic model (C) and BERTopic-models (D).</p>
          </caption>
          <graphic xlink:href="medinform_v10i12e40102_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure9" position="float">
          <label>Figure 9</label>
          <caption>
            <p>Descriptive time series plots characterizing the seasonal evolution of the respiratory disease topic, as estimated by nonnegative matrix factorization model (A), latent Dirichlet allocation model (B), structural topic model (C) and BERTopic-models (D).</p>
          </caption>
          <graphic xlink:href="medinform_v10i12e40102_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Post Hoc Internal Evaluation of Fitted Temporal Topic Models</title>
        <p>When investigating the top-ranked words associated with per-word topic distributions in <xref rid="figure3" ref-type="fig">Figures 3</xref>-<xref rid="figure6" ref-type="fig">6</xref> we note that each model can describe the corpus using a “bag” of up to 250 unique words (K=50 topics multiplied by top 5 words being presented for each latent topical representation). The number of unique words—also known as the topic diversity—observed in NMF, LDA, STM, and BERTopic model fits was 76.4% (191/250), 88.4% (221/250), 87.6% (219/250), and 77.2% (193/250), respectively. The top 5 most frequently recurring words or tokens describing the topical structure of each of the NMF, LDA, STM, and BERTopic models are listed in <xref ref-type="table" rid="table3">Table 3</xref>. Recurring words for LDA and STM are similar, suggesting that primary care issues related to back pain (and other musculoskeletal pain) are important, as are issues related to hypertension and feelings (eg, mood disorders). Conversely, the BERTopic model seems to prioritize primary care issues related to prescription drugs and laboratory ordering or management.</p>
        <p>We explored the semantic coherence of NMF, LDA, STM, and BERTopic models using the following metrics: “UMASS,” “UCI,” and “NPMI” (<xref ref-type="table" rid="table4">Table 4</xref>) [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. Larger coherence metrics indicated increasingly internally consistent latent topical characterizations. The “UMASS” metric favored the STM model, whereas, the “UCI” and “NPMI” metrics favored the BERTopic model.</p>
        <p>To investigate the differences and similarities in the fitted topic model, we used the Jaccard coefficient (<xref ref-type="table" rid="table5">Table 5</xref>). Using the Jaccard measure of concordance, the Bayesian models (LDA or STM) were identified as resulting in the most similar fit. The BERTopic model generated the most distinct topical representation compared with the other models.</p>
        <p>The time required to train each model was reported. For NMF, LDA, and STM models, we used a single central processing unit (although Python SKLearn implementations of decomposition models can be parallelized). For the BERTopic model, we used a single graphics processing unit for embedding documents and a single central processing unit for dimensionality reduction (UMAP) and clustering (HDBSCAN). Under these settings, the time required to fit the NMF, LDA, STM, and BERTopic models was 237 seconds, 67 seconds, 879 seconds (14.7 minutes), and 2624 seconds (43.7 minutes), respectively. The computational requirements of the BERTopic model exceeded those of the other models, particularly the highly optimized NMF or LDA implementations in Python SKLearn.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>The most frequently occurring tokens observed in each of the bags of 250 words describing the topical structure of latent Dirichlet allocation (LDA), nonnegative matrix factorization (NMF), structural topic model (STM) and BERTopic model fits (and their occurrence counts in the bag).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="230"/>
            <col width="210"/>
            <col width="160"/>
            <col width="150"/>
            <col width="250"/>
            <thead>
              <tr valign="top">
                <td>Word or token</td>
                <td colspan="4">Topic model</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>NMF (n)</td>
                <td>LDA (n)</td>
                <td>STM (n)</td>
                <td>BERTopic (n)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Word or token-1</td>
                <td>head (4)</td>
                <td>back (9)</td>
                <td>back (5)</td>
                <td>inr (11)</td>
              </tr>
              <tr valign="top">
                <td>Word or token-2</td>
                <td>mg (4)</td>
                <td>bp (6)</td>
                <td>mg (5)</td>
                <td>mg (9)</td>
              </tr>
              <tr valign="top">
                <td>Word or token-3</td>
                <td>ccac (3)</td>
                <td>pain (6)</td>
                <td>pain (5)</td>
                <td>lab (5)</td>
              </tr>
              <tr valign="top">
                <td>Word or token-4</td>
                <td>diabetes (3)</td>
                <td>chest (3)</td>
                <td>bp (4)</td>
                <td>prescription (5)</td>
              </tr>
              <tr valign="top">
                <td>Word or token-5</td>
                <td>feeling (3)</td>
                <td>feels (3)</td>
                <td>feels (3)</td>
                <td>dose (4)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Topical coherence measures (“UMASS,” “UCI,” and normalized pointwise mutual information [“NPMI”]) estimated on each of the nonnegative matrix factorization (NMF), latent Dirichlet allocation (LDA), structural topic model (STM), and BERTopic models.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="400"/>
            <col width="140"/>
            <col width="140"/>
            <col width="140"/>
            <col width="180"/>
            <thead>
              <tr valign="top">
                <td>Topical coherence measure</td>
                <td colspan="4">Topic model</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>NMF</td>
                <td>LDA</td>
                <td>STM</td>
                <td>BERTopic</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>UMASS</td>
                <td>−2.522</td>
                <td>−2.488</td>
                <td>−2.372</td>
                <td>−2.591</td>
              </tr>
              <tr valign="top">
                <td>UCI</td>
                <td>1.220</td>
                <td>0.987</td>
                <td>1.192</td>
                <td>1.405</td>
              </tr>
              <tr valign="top">
                <td>NPMI</td>
                <td>0.183</td>
                <td>0.149</td>
                <td>0.190</td>
                <td>0.230</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Jaccard coefficient metrics of set-based concordance between fitted topic models: nonnegative matrix factorization (NMF), latent Dirichlet allocation (LDA), structural topic model (STM), and BERTopic.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="240"/>
            <col width="170"/>
            <col width="170"/>
            <col width="170"/>
            <col width="250"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>NMF</td>
                <td>LDA</td>
                <td>STM</td>
                <td>BERTopic</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>NMF</td>
                <td>—<sup>a</sup></td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>LDA</td>
                <td>0.526</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>STM</td>
                <td>0.491</td>
                <td>0.577</td>
                <td>—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>BERTopic</td>
                <td>0.343</td>
                <td>0.286</td>
                <td>0.329</td>
                <td>—</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>Not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>In this study, we compared several distinct methodologies (ie, NMF, LDA, STM, and BERTopic) to estimate temporal topic models from a large collection of primary care clinical notes. Despite differences in the underlying statistical methodology, models often converged on a consistent latent characterization of the corpus. Furthermore, the temporal evolution of latent topics was reliably extracted from each of the NMF, LDA, STM, and BERTopic models.</p>
        <p>Clinically, our data set represented high-users of the primary care system. Many of the latent topics emerging from this analysis are consistent with a high-user archetype, for example, family counseling or social work, mood disorders, anxiety or depression, chronic pain, arthritis and musculoskeletal disorders, neurological conditions, cardiovascular disease and hypertension, diabetes, cancer screening (breast, cervical, colorectal, and prostate), laboratory requisitions and blood work, diagnostic imaging, and pharmaceutical or prescription management. Topic models also identified numerous acute health conditions as important latent themes, such as cough, cold and other respiratory infections, urinary tract infections, skin conditions, and wound care. NMF, LDA, STM, and BERTopic models each consistently captured (1) annual primary care influenza programs and (2) seasonal respiratory conditions, demonstrating predictable seasonal variation. Findings regarding primary care use patterns, extracted solely from clinical text data, were largely corroborated by provincial reporting based on structured administrative data [<xref ref-type="bibr" rid="ref33">33</xref>].</p>
        <p>We observed that disparate statistical methodologies for estimating temporal topic models generated a concordant or consistent latent representation. We interpreted this to mean that as the signal-to-noise ratio increases in a given clinical text data set, the subtle choice of statistical methodology seems to matter less, and any of these methods would extract a meaningful latent representation of the primary care corpus. For smaller corpora, where word-document co-occurrence statistics are less certain, this hypothesis may not hold.</p>
        <p>Furthermore, subtle or nuanced differences in model representations emerged, which may lead analysts to favor specific modeling strategies in particular settings. For example, consider <xref rid="figure8" ref-type="fig">Figure 8</xref> for the annual influenza vaccination program. Models such as NMF and LDA are purely unsupervised and do not consider external covariate information when formulating the model objective function. For NMF or LDA models, we noticed that the “grand mean” topical prevalence over time centers at approximately 2% (ie, 1/50 topics). Conversely, an STM intentionally incorporates covariate information in the Bayesian graphical models’ prior structure, and we observed that for STM, the lows for the annual influenza topic are much closer to 0%, whereas the fall or winter peaks are more pronounced. The BERTopic model does not intentionally incorporate covariate information into its objective function(s) either; however, it adopts a more “local averaging” principle to estimate topical distributions over time and, as such, demonstrated similar seasonal harmonic patterns as STM in the context of the annual influenza program. Similar patterns can be observed in <xref rid="figure9" ref-type="fig">Figure 9</xref> for seasonal respiratory diseases. This suggests that different topic models may perform more or less optimally in certain scientific settings (ie, may be dependent on the research question, available data, and how these foundational aspects of a study interplay with model choice). A priori, should the analyst or researcher expect topical prevalence to vary about select observable covariates, it may make sense to adopt a more flexible model that can adequately incorporate this anticipated behavior. If there is no a priori rationale to believe that topical prevalence varies as a function of covariates (eg, time in this study), then the choice of model may become less relevant, as all models may perform similarly well.</p>
        <p>Because of the different statistical principles associated with each temporal topic modeling methodology, each method is associated with its own strengths and weaknesses. We have elaborated on the methodological and computational issues associated with each class of models.</p>
        <p>First, NMF is the most mature and seemingly parsimonious methodology for topic modeling. NMF is strongly rooted in linear algebraic principles and is fundamentally based on the constrained optimization of a simple least squares objective function. Vanilla NMF is a well-studied statistical methodology and many efficient computational routines exist for estimating NMF models. NMF is flexible and can be readily extended. Possible model extensions can be viewed as discrete tunable hyperparameters in the model fitting process. Berry et al [<xref ref-type="bibr" rid="ref12">12</xref>] and Cichocki et al [<xref ref-type="bibr" rid="ref34">34</xref>] discussed distinct algorithmic techniques for estimating the latent parameters of an NMF model, such as gradient descent, multiplicative updates, and alternating nonnegative least squares. The choice of algorithm can be conceived as a discrete tunable hyperparameter. Furthermore, analysts are often confronted with the choice of whether to regularize the latent parameter matrices [<xref ref-type="bibr" rid="ref35">35</xref>]. Ridge, lasso, and elastic net regularization are commonly encountered, although more complex regularization can be used to encourage latent representations with smoothness, minimal volume, and other characteristics. Furthermore, many researchers have attempted to introduce coherent generalizations of NMF and related techniques [<xref ref-type="bibr" rid="ref13">13</xref>]. For example, generalized low-rank models that flexibly incorporate different loss functions, functional forms, weighting of data points, and regularization have been discussed by Udell et al [<xref ref-type="bibr" rid="ref13">13</xref>].</p>
        <p>LDA and STM are Bayesian topic models. LDA was developed as a fully Bayesian extension of existing linear algebraic-based (eg, latent semantic analysis) and maximum likelihood-based (eg, probabilistic latent semantic indexing) techniques for topic modeling [<xref ref-type="bibr" rid="ref2">2</xref>]. LDA has been extended in various ways, illustrating the flexibility of Bayesian probabilistic graphical models. For example, STM is a direct extension of LDA, which allows latent parameter matrices to vary as a function of observed covariates [<xref ref-type="bibr" rid="ref6">6</xref>]. Efficient computational fitting routines have been developed for LDA, and STM to a certain extent. Analysts face several decisions when fitting LDA and STM models to empirical data sets, including Bayesian inferential or computational methods (eg, Gibbs sampling vs variational inference) and prior distribution specifications.</p>
        <p>BERTopic represents the most novel approach to topic modeling [<xref ref-type="bibr" rid="ref11">11</xref>]. The BERTopic model is a pipeline: (1) deep neural networks (eg, sentence transformer models) embed documents in a vector space; (2) nonlinear dimension reduction is applied to latent document vectors (UMAP); (3) document clusters are identified (HDBSCAN); and (4) representative topics (collections of semantically correlated words) are extracted from document clusters using a cluster-specific TF-IDF scoring method. A disadvantage of the BERTopic pipeline is related to computational requirements. For large corpora, a graphics processing unit is required to learn document embeddings within a reasonable time. In our study, even with a graphics processing unit, we randomly down-sampled our data set (3/8 of the documents were included, whereas 5/8 were excluded). That said, the BERTopic model’s strength is related to its modularity. We observed that the BERTopic model generates meaningfully coherent topics, and as neural embedding methods continue to evolve, we anticipate that state-of-the-art document embedding techniques can be dropped into this pipeline.</p>
      </sec>
      <sec>
        <title>Limitations and Future Work</title>
        <p>We attempted to be transparent with respect to how our final vocabulary of words or tokens was selected and accordingly the DTMs were constructed for this study. Different computational pipelines could have been used to preprocess our clinical text corpus. For instance, we could have used different strategies for tokenization, lemmatization, stemming, stop-word removal, and frequency-based word or token removal. Different text preprocessing pipelines would ultimately lead to different DTM structures (with different vocabularies). Further research is needed to better understand the implications of these text preprocessing decisions on downstream study inferences.</p>
        <p>Each topic model considered in this study requires specification of hyperparameters that govern the aspects of model fitting. Fitting these topic models is computationally intensive for large input data sets. We focused mainly on the stability and robustness of inferences with respect to model complexity (K), a common hyperparameter across all models. We did not explore the stability of the inferences across other model-specific hyperparameters.</p>
        <p>We did not consider all possible methods for estimating temporal topic models in this study. Bespoke NMF and LDA variants exist that are applicable for estimating temporal topic models. Sequential NMF [<xref ref-type="bibr" rid="ref36">36</xref>] and dynamic LDA [<xref ref-type="bibr" rid="ref37">37</xref>] are 2 extensions that are relevant for estimating temporal topic models. Tensor factorization models such as the canonical polyadic decomposition or Tucker decomposition, which factorize a D*V*T tensor into meaningful latent parameter matrices, may also be applicable [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. Additional surveys related to topic modeling are provided in the studies by Churchill and Singh [<xref ref-type="bibr" rid="ref39">39</xref>], Zhao et al [<xref ref-type="bibr" rid="ref40">40</xref>], and Boyd-Graber et al [<xref ref-type="bibr" rid="ref41">41</xref>].</p>
        <p>These works have led us to consider several possible ways of extending different topic modeling frameworks, including Bayesian NMF with document-level covariates (similar to the STM extension of LDA), neural matrix factorization with (nontemporal) covariates, LDA or STM extensions that allow per-document topical prevalence weights to vary according to a flexible generalized linear mixed model or multilevel model (for modeling dependencies introduced because of the complex design or sampling mechanism by which documents are created), and computational methods for improving statistical inference (eg, interval estimation and hypothesis testing) when engaging with temporal topic models (eg, resampling methods, bootstrap, and multiple outputation).</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this study, we compared several statistical techniques for estimating temporal topic models from primary care clinical text data. Different temporal topic models have unique strengths and weaknesses owing to their underlying statistical properties. Nonetheless, each model consistently estimated a latent variable representation of a primary care document collection, which meaningfully characterized high-use primary care patients and their longitudinal interactions with the primary health care system. As the adoption of EMRs increases and health care organizations amass increasingly large volumes of clinical text data, temporal topic models may offer a mechanism for leveraging unstructured clinical text data for characterization and monitoring of primary care practices and systems.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">DTM</term>
          <def>
            <p>document term matrix</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">EMR</term>
          <def>
            <p>electronic medical record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">HDBSCAN</term>
          <def>
            <p>hierarchical density-based spatial clustering of applications with noise</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">LDA</term>
          <def>
            <p>latent Dirichlet allocation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">NMF</term>
          <def>
            <p>nonnegative matrix factorization</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">NPMI</term>
          <def>
            <p>normalized pointwise mutual information</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">STM</term>
          <def>
            <p>structural topic model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">TF-IDF</term>
          <def>
            <p>term frequency-inverse document frequency</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">UMAP</term>
          <def>
            <p>uniform manifold approximation and projection</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This study was supported by funding provided by a Foundation Grant (FDN 143303) from the Canadian Institutes of Health Research. The funding agency had no role in the study design; collection, analysis, or interpretation of data; writing of the report; or decision to submit the report for publication. Dr Austin is supported by a Mid-Career Investigator Award from the Heart and Stroke Foundation.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mossialos</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Djordjevic</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Osborn</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sarnak</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>International profiles of health care systems</article-title>
          <source>The Commonwealth Fund</source>
          <year>2017</year>
          <month>5</month>
          <day>31</day>
          <access-date>2022-09-30</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.commonwealthfund.org/publications/fund-reports/2017/may/international-profiles-health-care-systems">https://www.commonwealthfund.org/publications/fund-reports/2017/may/international-profiles-health -care-systems</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Blei</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jordan</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Latent Dirichlet allocation</article-title>
          <source>J Mach Learn Res</source>
          <year>2003</year>
          <month>1</month>
          <day>3</day>
          <volume>3</volume>
          <fpage>993</fpage>
          <lpage>1022</lpage>
          <pub-id pub-id-type="doi">10.5555/944919.944937</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Blei</surname>
              <given-names>DM</given-names>
            </name>
          </person-group>
          <article-title>Probabilistic topic models</article-title>
          <source>Commun ACM</source>
          <year>2012</year>
          <month>04</month>
          <volume>55</volume>
          <issue>4</issue>
          <fpage>77</fpage>
          <lpage>84</lpage>
          <pub-id pub-id-type="doi">10.1145/2133806.2133826</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Griffiths</surname>
              <given-names>TL</given-names>
            </name>
            <name name-style="western">
              <surname>Steyvers</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Finding scientific topics</article-title>
          <source>Proc Natl Acad Sci U S A</source>
          <year>2004</year>
          <month>04</month>
          <day>06</day>
          <volume>101 Suppl 1</volume>
          <issue>suppl_1</issue>
          <fpage>5228</fpage>
          <lpage>35</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/14872004"/>
          </comment>
          <pub-id pub-id-type="doi">10.1073/pnas.0307752101</pub-id>
          <pub-id pub-id-type="medline">14872004</pub-id>
          <pub-id pub-id-type="pii">0307752101</pub-id>
          <pub-id pub-id-type="pmcid">PMC387300</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Griffiths</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Steyvers</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Landauer</surname>
              <given-names>TK</given-names>
            </name>
            <name name-style="western">
              <surname>McNamara</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Dennis</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kintsch</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Probabilistic topic models</article-title>
          <source>Handbook of Latent Semantic Analysis</source>
          <year>2007</year>
          <publisher-loc>New York, NY, USA</publisher-loc>
          <publisher-name>Psychology Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>BM</given-names>
            </name>
            <name name-style="western">
              <surname>Airoldi</surname>
              <given-names>EM</given-names>
            </name>
          </person-group>
          <article-title>A model of text for experimentation in the social sciences</article-title>
          <source>J Am Stat Assoc</source>
          <year>2016</year>
          <month>10</month>
          <day>18</day>
          <volume>111</volume>
          <issue>515</issue>
          <fpage>988</fpage>
          <lpage>1003</lpage>
          <pub-id pub-id-type="doi">10.1080/01621459.2016.1141684</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Paatero</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Tapper</surname>
              <given-names>U</given-names>
            </name>
          </person-group>
          <article-title>Positive matrix factorization: a non-negative factor model with optimal utilization of error estimates of data values</article-title>
          <source>Environmetrics</source>
          <year>1994</year>
          <month>06</month>
          <volume>5</volume>
          <issue>2</issue>
          <fpage>111</fpage>
          <lpage>26</lpage>
          <pub-id pub-id-type="doi">10.1002/env.3170050203</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>DD</given-names>
            </name>
            <name name-style="western">
              <surname>Seung</surname>
              <given-names>HS</given-names>
            </name>
          </person-group>
          <article-title>Learning the parts of objects by non-negative matrix factorization</article-title>
          <source>Nature</source>
          <year>1999</year>
          <month>10</month>
          <day>21</day>
          <volume>401</volume>
          <issue>6755</issue>
          <fpage>788</fpage>
          <lpage>91</lpage>
          <pub-id pub-id-type="doi">10.1038/44565</pub-id>
          <pub-id pub-id-type="medline">10548103</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Seung</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Algorithms for non-negative matrix factorization</article-title>
          <source>Proceedings of the 13th International Conference on Neural Information Processing Systems</source>
          <year>2000</year>
          <conf-name>NeurIPS '00</conf-name>
          <conf-date>January 1, 2000</conf-date>
          <conf-loc>Denver, CO, USA</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Angelov</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>TOP2VEC: distributed representations of topics</article-title>
          <source>arXiv</source>
          <year>2020</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2008.09470"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Grootendorst</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>BERTopic: neural topic modeling with a class-based TF-IDF procedure</article-title>
          <source>arXiv</source>
          <year>2022</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2203.05794"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Berry</surname>
              <given-names>MW</given-names>
            </name>
            <name name-style="western">
              <surname>Browne</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Langville</surname>
              <given-names>AN</given-names>
            </name>
            <name name-style="western">
              <surname>Pauca</surname>
              <given-names>VP</given-names>
            </name>
            <name name-style="western">
              <surname>Plemmons</surname>
              <given-names>RJ</given-names>
            </name>
          </person-group>
          <article-title>Algorithms and applications for approximate nonnegative matrix factorization</article-title>
          <source>Comput Stat Data Anal</source>
          <year>2007</year>
          <month>9</month>
          <volume>52</volume>
          <issue>1</issue>
          <fpage>155</fpage>
          <lpage>73</lpage>
          <pub-id pub-id-type="doi">10.1016/j.csda.2006.11.006</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Udell</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Horn</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zadeh</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Generalized low rank models</article-title>
          <source>FNT Mach Learn</source>
          <year>2016</year>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>118</lpage>
          <pub-id pub-id-type="doi">10.1561/2200000055</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Griffiths</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Gibbs sampling in the generative model of latent Dirichlet allocation</article-title>
          <source>CiteSeerX</source>
          <access-date>2022-09-30</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.7.8022">https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.7.8022</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Heinrich</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Parameter Estimation for Text Analysis: Technical Report</article-title>
          <source>University of Leipzig</source>
          <year>2008</year>
          <access-date>2022-11-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.arbylon.net/publications/text-est.pdf">http://www.arbylon.net/publications/text-est.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Blei</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Lafferty</surname>
              <given-names>JD</given-names>
            </name>
          </person-group>
          <article-title>A correlated topic model of Science</article-title>
          <source>Ann Appl Stat</source>
          <year>2007</year>
          <month>6</month>
          <day>1</day>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>17</fpage>
          <lpage>35</lpage>
          <pub-id pub-id-type="doi">10.1214/07-aoas114</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Distributed representations of words and phrases and their compositionality</article-title>
          <source>Proceedings of the 26th International Conference on Neural Information Processing Systems - Volume 2</source>
          <year>2013</year>
          <conf-name>NeurIPS '13</conf-name>
          <conf-date>December 5-10, 2013</conf-date>
          <conf-loc>Lake Tahoe, NV, USA</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Efficient estimation of word representations in vector space</article-title>
          <source>arXiv</source>
          <year>2013</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1301.3781"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1301.3781</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Yih</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Zweig</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Linguistic regularities in continuous space word representations</article-title>
          <source>Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2013</year>
          <conf-name>NAACL '13</conf-name>
          <conf-date>June 9-14, 2013</conf-date>
          <conf-loc>Atlanta, GA, USA</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pennington</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>GloVe: global vectors for word representation</article-title>
          <source>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2014</year>
          <conf-name>EMNLP '14</conf-name>
          <conf-date>October 26–28, 2014</conf-date>
          <conf-loc>Doha, Qatar</conf-loc>
          <pub-id pub-id-type="doi">10.3115/v1/d14-1162</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Joulin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Grave</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bojanowski</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Bag of tricks for efficient text classification</article-title>
          <source>arXiv</source>
          <year>2016</year>
          <month>8</month>
          <day>6</day>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1607.01759.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/e17-2068</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peters</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Neumann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Iyyer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gardner</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zettlemoyer</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Deep contextualized word representations</article-title>
          <source>arXiv</source>
          <year>2018</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1802.05365"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/n18-1202</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Akbik</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bergmann</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Blythe</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Rasul</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Schweter</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Vollgraf</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>FLAIR: an easy to use framework for state of the art NLP</article-title>
          <source>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics (Demonstrations)</source>
          <year>2019</year>
          <conf-name>NAACL '19</conf-name>
          <conf-date>June, 2019</conf-date>
          <conf-loc>Minneapolis, MN, USA</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>arXiv</source>
          <year>2018</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1810.04805"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Le</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Distributed representations of sentences and documents</article-title>
          <source>arXiv</source>
          <year>2014</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1405.4053"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cer</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kong</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hua</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Limtiaco</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>St. John</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Constant</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Guajardo-Cespedes</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tar</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Sung</surname>
              <given-names>YH</given-names>
            </name>
            <name name-style="western">
              <surname>Strope</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kurzweil</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Universal sentence encoder</article-title>
          <source>arXiv</source>
          <year>2018</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1803.11175"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reimers</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Gurevych</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Sentence-BERT: sentence embeddings using Siamese BERT-networks</article-title>
          <source>arXiv</source>
          <year>2019</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1908.10084"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/d19-1410</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Song</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>T-Y</given-names>
            </name>
          </person-group>
          <article-title>MPNet: masked and permuted pre-training for language understanding</article-title>
          <source>arXiv</source>
          <year>2020</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2004.09297"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McInnes</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Healy</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Melville</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>UMAP: uniform manifold approximation and projection for dimension reduction</article-title>
          <source>arXiv</source>
          <year>2018</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1802.03426"/>
          </comment>
          <pub-id pub-id-type="doi">10.21105/joss.00861</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Campello</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Moulavi</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sander</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Density-based clustering based on hierarchical density estimates</article-title>
          <source>Proceedings of the 17th Pacific-Asia Conference on Advances in Knowledge Discovery and Data Mining</source>
          <year>2013</year>
          <conf-name>PAKDD '13</conf-name>
          <conf-date>April 14-17, 2013</conf-date>
          <conf-loc>Gold Coast, Australia</conf-loc>
          <fpage>160</fpage>
          <lpage>72</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/chapter/10.1007/978-3-642-37456-2_14"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/978-3-642-37456-2_14</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Röder</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Both</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hinneburg</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Exploring the space of topic coherence measures</article-title>
          <source>Proceedings of the Eighth ACM International Conference on Web Search and Data Mining</source>
          <year>2015</year>
          <conf-name>WSDM '15</conf-name>
          <conf-date>February 2-6, 2015</conf-date>
          <conf-loc>Shanghai, China</conf-loc>
          <fpage>399</fpage>
          <lpage>408</lpage>
          <pub-id pub-id-type="doi">10.1145/2684822.2685324</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rosner</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Hinneburg</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Röder</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nettling</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Both</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Evaluating topic coherence measures</article-title>
          <source>arXiv</source>
          <year>2014</year>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jaakkimainen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Upshur</surname>
              <given-names>RE</given-names>
            </name>
            <name name-style="western">
              <surname>Klein-Geltink</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Leong</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Maaten</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schultz</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Primary Care in Ontario: ICES Atlas</article-title>
          <source>Institute for Clinical Evaluative Sciences</source>
          <year>2006</year>
          <month>11</month>
          <access-date>2022-11-07</access-date>
          <publisher-loc>Toronto, Canada</publisher-loc>
          <publisher-name>Institute for Clinical Evaluative Sciences</publisher-name>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ices.on.ca/~/media/Files/Atlases-Reports/2006/Primary-care-in-Ontario/Full-report.ashx">https://www.ices.on.ca/~/media/Files/Atlases-Reports/2006/Primary-care-in-Ontario/Full-report.ashx</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cichocki</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Zdunek</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Phan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Amari</surname>
              <given-names>S-I</given-names>
            </name>
          </person-group>
          <source>Nonnegative Matrix and Tensor Factorizations: Applications to Exploratory Multi-Way Data Analysis and Blind Source Separation</source>
          <year>2009</year>
          <publisher-loc>Hoboken, NJ, USA</publisher-loc>
          <publisher-name>Wiley Online Library</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hoyer</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Non-negative matrix factorization with sparseness constraints</article-title>
          <source>J Mach Learn Res</source>
          <year>2004</year>
          <month>1</month>
          <day>12</day>
          <volume>5</volume>
          <fpage>1457</fpage>
          <lpage>69</lpage>
          <pub-id pub-id-type="doi">10.5555/1005332.1044709</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mackevicius</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bahle</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Denisenko</surname>
              <given-names>NI</given-names>
            </name>
            <name name-style="western">
              <surname>Denisenko</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Goldman</surname>
              <given-names>MS</given-names>
            </name>
          </person-group>
          <article-title>Unsupervised discovery of temporal sequences in high-dimensional datasets, with applications to neuroscience</article-title>
          <source>eLife</source>
          <year>2019</year>
          <volume>8</volume>
          <fpage>e38471</fpage>
          <pub-id pub-id-type="doi">10.7554/elife.38471</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Blei</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Lafferty</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Dynamic topic models</article-title>
          <source>Proceedings of the 23rd International Conference on Machine Learning</source>
          <year>2006</year>
          <conf-name>ICML '06</conf-name>
          <conf-date>June 25-29, 2006</conf-date>
          <conf-loc>Pittsburgh, PA, USA</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kolda</surname>
              <given-names>TG</given-names>
            </name>
            <name name-style="western">
              <surname>Bader</surname>
              <given-names>BW</given-names>
            </name>
          </person-group>
          <article-title>Tensor decompositions and applications</article-title>
          <source>SIAM Rev</source>
          <year>2009</year>
          <month>08</month>
          <day>06</day>
          <volume>51</volume>
          <issue>3</issue>
          <fpage>455</fpage>
          <lpage>500</lpage>
          <pub-id pub-id-type="doi">10.1137/07070111x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Churchill</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>The evolution of topic modeling</article-title>
          <source>ACM Comput Surv (forthcoming)</source>
          <year>2022</year>
          <month>01</month>
          <day>12</day>
          <fpage>2021</fpage>
          <pub-id pub-id-type="doi">10.1145/3507900</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Phung</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Huynh</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Buntine</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Topic modelling meets deep neural networks: a survey</article-title>
          <source>arXiv</source>
          <year>2021</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2103.00498"/>
          </comment>
          <pub-id pub-id-type="doi">10.24963/ijcai.2021/638</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Boyd-Graber</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Mimno</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Applications of topic models</article-title>
          <source>FNT Inf Retrieval</source>
          <year>2017</year>
          <volume>11</volume>
          <issue>2-3</issue>
          <fpage>143</fpage>
          <lpage>296</lpage>
          <pub-id pub-id-type="doi">10.1561/1500000030</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
