<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e86877</article-id><article-id pub-id-type="doi">10.2196/86877</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>IdeaDistiller&#x2014;AI Support for Idea Synthesis in Concept Mapping: Algorithm Development and Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Qwaider</surname><given-names>Chatrine</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Speicher</surname><given-names>Nora K</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Genell</surname><given-names>Anna E</given-names></name><degrees>MSc</degrees><xref ref-type="aff" 
rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Holtenman</surname><given-names>Mikael</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Vaughn</surname><given-names>Lisa</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Smith</surname><given-names>Frida</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff6">6</xref></contrib></contrib-group><aff id="aff1"><institution>E-commons, Chalmers University of Technology</institution><addr-line>Chalmersplatsen 1</addr-line><addr-line>Gothenburg</addr-line><country>Sweden</country></aff><aff id="aff2"><institution>Department of Natural Language Processing, Mohamed bin Zayed University of Artificial Intelligence</institution><addr-line>Abu Dhabi</addr-line><country>United Arab Emirates</country></aff><aff id="aff3"><institution>Regional Cancer Centre West</institution><addr-line>Gothenburg</addr-line><country>Sweden</country></aff><aff id="aff4"><institution>Cincinnati Children&#x2019;s Hospital Medical Center, University of Cincinnati College of Medicine Department of Pediatrics</institution><addr-line>Cincinnati</addr-line><addr-line>OH</addr-line><country>United States</country></aff><aff id="aff5"><institution>Educational and Community-Based Action Research, University of Cincinnati College of Education, Criminal Justice, and Human Services</institution><addr-line>Cincinnati</addr-line><addr-line>OH</addr-line><country>United States</country></aff><aff id="aff6"><institution>Department of Technology Management and Economics, Chalmers University of Technology</institution><addr-line>Gothenburg</addr-line><country>Sweden</country></aff><contrib-group><contrib 
contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Hackett</surname><given-names>Katie</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Grainger</surname><given-names>Matthew</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Nora K Speicher, PhD, E-commons, Chalmers University of Technology, Chalmersplatsen 1, Gothenburg, 412 96, Sweden, 46 732501659; <email>nora.speicher@chalmers.se</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>2</day><month>7</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e86877</elocation-id><history><date date-type="received"><day>11</day><month>11</month><year>2025</year></date><date date-type="rev-recd"><day>31</day><month>05</month><year>2026</year></date><date date-type="accepted"><day>10</day><month>06</month><year>2026</year></date></history><copyright-statement>&#x00A9; Chatrine Qwaider, Nora K Speicher, Anna E Genell, Mikael Holtenman, Lisa Vaughn, Frida Smith. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 2.7.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e86877"/><abstract><sec><title>Background</title><p>Concept mapping (CM) is a widely used mixed methods research approach for structuring and visualizing complex ideas across various fields, such as the health sciences. A critical bottleneck in the CM process is the idea synthesis phase, which remains labor-intensive, subjective, and consequently challenging to scale for large datasets.</p></sec><sec><title>Objective</title><p>In this study, we propose IdeaDistiller, a semiautomated solution based on semantic clustering to optimize the idea synthesis step while maintaining methodological rigor through a human-in-the-loop approach.</p></sec><sec sec-type="methods"><title>Methods</title><p>Using 9 health care&#x2013;related datasets in English and Swedish, we systematically evaluated different embedding models, dimensionality reduction techniques, and clustering algorithms to identify robust and reproducible parameter settings for the proposed approach. 
IdeaDistiller clusters participant-generated ideas based on semantic similarity to identify similar ideas with different wording, suggests representative and unique ideas per cluster, and provides coherence scores and sorted outputs to aid manual validation.</p></sec><sec sec-type="results"><title>Results</title><p>Our findings suggest that IdeaDistiller may substantially reduce the manual effort involved in idea synthesis while preserving quality and transparency. However, human expertise remains indispensable for validating and refining cluster outputs.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Integrating semiautomated methods into the CM workflow offers significant potential for improving the efficiency, scalability, and rigor of the CM process. Building on our work will enable the exploration of larger multilingual datasets and integration into future CM studies.</p></sec></abstract><kwd-group><kwd>concept mapping</kwd><kwd>semantic clustering</kwd><kwd>topic modeling</kwd><kwd>bidirectional encoder representations topic modeling</kwd><kwd>BERTopic</kwd><kwd>human-in-the-loop</kwd><kwd>qualitative research automation</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Concept mapping (CM) is a structured research method designed to visualize and organize thoughts, ideas, or knowledge concerning a specific topic by creating graphical representations of concepts and their interrelationships [<xref ref-type="bibr" rid="ref1">1</xref>]. Widely used in disciplines such as education, psychology, health sciences, and other social sciences, CM is an integrative mixed methods research methodology for exploring and analyzing complex conceptual domains. It includes both qualitative and quantitative data collection and analysis in sequential steps, where each step builds on the previous one, as described below [<xref ref-type="bibr" rid="ref2">2</xref>]. 
One of the strengths of this method is the active involvement of stakeholders throughout the process. Depending on the project and the research question, the participants can be more or less involved in all stages described in the following sections [<xref ref-type="bibr" rid="ref3">3</xref>].</p></sec><sec id="s1-2"><title>CM Methodology</title><p>After the initial preparation stage, the CM methodology typically comprises the following sequential stages, as shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>. Normally, idea synthesis is part of idea generation, but for clarity in this study, it is defined here as an additional step in the process.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Process of concept mapping methodology.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e86877_fig01.png"/></fig><sec id="s1-2-1"><title>Idea Generation</title><p>The research team formulates a focused research question, typically structured as an incomplete sentence or prompt. Participants individually complete this sentence, usually generating between 1 and 5 responses, referred to as ideas. Given the volume of data collected, a subsequent idea synthesis process is necessary to manage and refine the input so that the ideas are manageable for the sorting step.</p></sec><sec id="s1-2-2"><title>Idea Synthesis</title><p>Idea synthesis involves a series of refinement steps to produce a concise yet representative set of ideas for further analysis, referred to as the set of unique ideas for the study. This process includes eliminating identical or highly similar ideas, preserving unique and relevant contributions, removing off-topic ideas, and segmenting complex or compound ideas into more granular units. Additionally, grammatical and typographical corrections are applied to standardize the text. 
The overarching goal is to reduce the initial set of ideas to a manageable number, typically no more than 100 distinct ideas, while preserving the conceptual breadth and diversity of the original input [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref4">4</xref>].</p></sec><sec id="s1-2-3"><title>Structuring the Ideas</title><p>A subset of participants individually sorts the unique ideas into thematic categories, ensuring that each idea is placed in only 1 category. This process can be done manually or, as in most cases, using designated card-sorting software. Participants also assign descriptive labels to the categories they create.</p></sec><sec id="s1-2-4"><title>Representation</title><p>The sorting data are transformed into a similarity matrix that captures how often pairs of ideas are grouped together. Multidimensional scaling is then applied to represent these similarities or dissimilarities on a 2D plane. The multidimensional scaling output is subjected to hierarchical cluster analysis to develop a hierarchy of clusters. The research team reviews successive cluster solutions to determine the optimal number of clusters, selecting the most meaningful and interpretable configuration while balancing sufficient yet manageable detail [<xref ref-type="bibr" rid="ref5">5</xref>]. Depending on the project, participants can also take part in this stage, as well as in the final step, interpretation.</p></sec><sec id="s1-2-5"><title>Interpretation</title><p>Clusters are labeled based on the category names suggested during sorting. The final concept map visually presents the ideas&#x2019; spatial coordinates and delineated clusters, offering a structured and interpretable visualization of the conceptual domains.</p><p>The principal objective of CM is to provide a clear, visual representation of a conceptual space and set of ideas, thereby enhancing the understanding of underlying structures and relationships among the ideas. 
A defining characteristic of CM is the active engagement of stakeholders at multiple stages of the research process, including data collection, analysis, and interpretation, making it highly compatible with participatory research methodologies [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>Despite its structured framework, CM poses a number of challenges. Criticisms, even from the originators, include the potential for subjectivity and bias, particularly during qualitative phases, as well as the method&#x2019;s high demands on time and resources for researchers and participants [<xref ref-type="bibr" rid="ref6">6</xref>]. The idea synthesis phase is particularly concerning, as it involves refining, consolidating, and preparing ideas for subsequent sorting and analysis [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>While the core CM methodology is well-established, the growth of large-scale, digitally mediated data collections necessitates more efficient, rigorous, and partially automated approaches to the idea synthesis step.</p></sec></sec><sec id="s1-3"><title>Related Work</title><p>The idea synthesis phase in CM has traditionally relied on manual, labor-intensive procedures conducted by research teams through iterative consensus-building exercises. A range of approaches has been documented in the literature. For instance, Ashe et al [<xref ref-type="bibr" rid="ref7">7</xref>] describe a multistage process in which individual review of items was followed by several collaborative meetings to finalize the list of unique ideas. Similarly, Windsor [<xref ref-type="bibr" rid="ref8">8</xref>] employed 5 independent coders to identify redundancies, after which consensus was reached on the items to retain. Hassmiller Lich et al [<xref ref-type="bibr" rid="ref9">9</xref>] introduced a more advanced strategy by comparing an initial set of 830 ideas with a final curated set of 97 to assess representativeness. 
Pantha et al [<xref ref-type="bibr" rid="ref2">2</xref>] conducted a review indicating that while approximately one-third of studies provided an explicit rationale for their sample size, three-quarters described their idea synthesis process in detail. However, concerns about methodological transparency remain. For example, McLinden [<xref ref-type="bibr" rid="ref10">10</xref>] criticized the often insufficient reporting of synthesis procedures, arguing that such omissions hinder the ability to evaluate the representativeness and validity of the final idea sets. These studies underscore the variability in practice and the need for more standardized and transparent approaches to idea synthesis in CM research.</p><p>Despite growing awareness of the importance of rigorous idea synthesis, inconsistencies persist in methodological reporting and implementation. Even though digital technologies have enabled larger-scale data collection, they have also further compounded the challenges and increased the burden on researchers to conduct transparent and methodologically sound synthesis. While Kane and Trochim [<xref ref-type="bibr" rid="ref1">1</xref>] emphasized that CM relies on a combination of technological support and human qualitative judgment&#x2014;a &#x201C;high-tech, high-touch&#x201D; approach&#x2014;there is a growing need for methods that blend automation with interpretive rigor without compromising the qualitative depth of the process.</p><p>To address all the aforementioned challenges, this study presents the development and evaluation of IdeaDistiller, a semiautomated tool designed to support the idea synthesis phase of CM. By leveraging machine learning techniques, our approach aims to reduce researchers&#x2019; burden, enhance methodological transparency, and preserve qualitative depth. 
We demonstrate the tool&#x2019;s utility through a series of exploratory benchmarks across diverse English and Swedish datasets, providing a framework for more efficient and reproducible conceptual analysis in large-scale studies.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Design Rationale and Requirements</title><p>Recognizing that idea synthesis is inherently a qualitative task requiring nuanced judgment, we adopted a human-in-the-loop (HITL) approach, integrating automated techniques with expert oversight. This strategy aligns with the recommendation by Kane and Trochim [<xref ref-type="bibr" rid="ref1">1</xref>] that, while technology can support the CM process, human interpretive engagement remains essential. We acknowledge that the distillation of ideas is not a discovery of an objective &#x201C;ground truth&#x201D; but rather a subjective interpretive process in which granularity and categorization depend on the researcher&#x2019;s perspective. Overall, we aimed at developing a tool that fulfills the following requirements:</p><list list-type="bullet"><list-item><p>Redundancy reduction: identifying and eliminating duplicate or semantically similar ideas to minimize repetition within the idea set.</p></list-item><list-item><p>Semantic alignment: assessing and organizing ideas by measuring their semantic similarity to each other.</p></list-item><list-item><p>Preservation of unique contributions: safeguarding novel and valuable ideas to maintain the richness and diversity of participants&#x2019; input.</p></list-item></list><p>Our approach is based on the principle that all ideas can be clustered into semantically coherent groups, where each group consists of ideas conveying the same or a similar idea in different formulations, and the main idea for each group is distinct from all other groups. 
Creating such a clustering enables the researcher to subsequently extract one representative idea from each group, ensuring that all extracted ideas collectively represent the diversity of the complete dataset.</p><p>We explored several approaches to clustering ideas based on their semantic similarity. In the following sections, we describe the data, clustering methods, and evaluation metrics used, as well as the output created to facilitate the easy integration of human feedback.</p></sec><sec id="s2-2"><title>Datasets</title><p>This study used data from various independent CM studies. Each dataset comprised two components: (1) the complete set of raw ideas provided by participants and (2) the manually identified set of unique ideas, which served as the reference for evaluating the performance of the computational methods developed.</p><p>In total, 9 datasets were analyzed: 6 in English and 3 in Swedish. The English datasets cover diverse topics within the health and health care domains, including obesity, stress, medical services [<xref ref-type="bibr" rid="ref11">11</xref>], drug abuse [<xref ref-type="bibr" rid="ref12">12</xref>], and aspects of suicide screening [<xref ref-type="bibr" rid="ref13">13</xref>]. The datasets covering the first 3 topics originally contained Spanish ideas collected from Latino participants in Cincinnati; however, their English translations were used in this study. The Swedish datasets addressed cancer rehabilitation [<xref ref-type="bibr" rid="ref14">14</xref>], the support structures available to patients and their families (the &#x201C;Kraftens hus&#x201D; initiative) [<xref ref-type="bibr" rid="ref15">15</xref>], and the use of development plans as a tool in patient care [<xref ref-type="bibr" rid="ref16">16</xref>].</p><p>Before clustering, exact duplicate entries, which were particularly frequent among shorter ideas, were removed as they would not contribute additional unique ideas. 
However, within our approach, it is possible to retain duplicates to allow for a quantitative follow-up analysis.</p><p><xref ref-type="table" rid="table1">Table 1</xref> provides an overview of the datasets, along with their sizes and the number of manually identified unique ideas. The datasets vary in thematic focus, language, participant demographics, and dataset size, providing a robust basis for evaluating the generalizability and performance of the proposed computational approach.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Datasets used in the study.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset</td><td align="left" valign="bottom">Language</td><td align="left" valign="bottom">Original ideas<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom">Unique ideas</td></tr></thead><tbody><tr><td align="left" valign="top">Obesity</td><td align="left" valign="top">English</td><td align="left" valign="top">406</td><td align="left" valign="top">100</td></tr><tr><td align="left" valign="top">Stress</td><td align="left" valign="top">English</td><td align="left" valign="top">670</td><td align="left" valign="top">97</td></tr><tr><td align="left" valign="top">Medical services</td><td align="left" valign="top">English</td><td align="left" valign="top">697</td><td align="left" valign="top">96</td></tr><tr><td align="left" valign="top">Drug abuse</td><td align="left" valign="top">English</td><td align="left" valign="top">162</td><td align="left" valign="top">75</td></tr><tr><td align="left" valign="top">Results of suicide screening</td><td align="left" valign="top">English</td><td align="left" valign="top">462</td><td align="left" valign="top">80</td></tr><tr><td align="left" valign="top">Important parts of suicide screening</td><td align="left" valign="top">English</td><td align="left" valign="top">415</td><td align="left" 
valign="top">80</td></tr><tr><td align="left" valign="top">Cancer rehabilitation</td><td align="left" valign="top">Swedish</td><td align="left" valign="top">525</td><td align="left" valign="top">67</td></tr><tr><td align="left" valign="top">Cancer support center</td><td align="left" valign="top">Swedish</td><td align="left" valign="top">121</td><td align="left" valign="top">72</td></tr><tr><td align="left" valign="top">Development plans</td><td align="left" valign="top">Swedish</td><td align="left" valign="top">375</td><td align="left" valign="top">100</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Number of original ideas after the removal of duplicates.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-3"><title>Semantic Clustering</title><p>Semantic similarity identifies latent patterns within textual datasets by uncovering recurring groups of words that represent underlying themes or topics. Traditional methods, such as latent Dirichlet allocation [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>], primarily rely on word frequency statistics to discover topics, which makes them limited in their ability to find semantic similarity in sentences with very different wording. Therefore, our approach builds on the BERTopic (bidirectional encoder representations topic modeling) framework [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref21">21</xref>], a technique that incorporates textual relationships and semantic meaning.</p><p>BERTopic follows a flexible pipeline comprising several steps. 
Since our primary interest lies in grouping each idea into a semantically coherent cluster, we focus on the first three stages of the pipeline: (1) embedding the ideas into a dense vector space, (2) reducing the dimensionality of the embedding space, and (3) clustering the embedded original ideas to form semantically related groups.</p><p>The core principle of BERTopic is embedding sentences into a high-dimensional embedding space to transform textual data into dense vector representations before clustering them. Semantic similarities between texts are reflected by their proximity in the vector space. These embeddings allow for a more nuanced understanding of similarity beyond simple word counts. In this study, we used several different pretrained embedding models, adapted to different languages, to create the sentence embeddings.</p><p>Through this process, all ideas are assigned to clusters, each containing original ideas with similar semantic meanings. CM researchers can then extract one or more representative ideas from each cluster. Each of the 3 mentioned steps offers multiple methodological choices, and we selected the specific methods that showed the most robust results across our exploratory study. While this process provides a good starting point, manual refinement is still needed. To simplify this task for the researcher, we additionally sort all ideas based on their cluster assignment and the semantic similarity between the clusters; that is, we organize the clusters such that those covering similar themes are positioned closer together in the output. To achieve this, we applied hierarchical clustering to the topic representations and used the resulting dendrogram to order the output.</p></sec><sec id="s2-4"><title>Evaluation Metrics</title><p>Standard evaluation metrics for topic modeling, such as coherence scores and perplexity, assess the semantic consistency and interpretability of the identified topics. 
In our case, however, the availability of manually curated unique ideas from previous CM studies enabled more targeted evaluation measures. Rather than measuring &#x201C;correctness&#x201D; in an absolute sense, these metrics quantify the degree of alignment between the computational output and a specific, human-led reference synthesis. To quantify the performance of IdeaDistiller, we developed 2 coverage-based metrics that assess the alignment between the computationally generated clusters (without human refinement) and the expert-identified unique ideas. These metrics are based on the heuristic that an optimal alignment with the reference standard would produce as many clusters as there are unique ideas, with 1 unique idea representing each cluster. Therefore, the metrics used consider the overlap between computational clustering and curated unique ideas, which is the number of clusters containing at least 1 unique idea. The 2 metrics are defined as follows:</p><disp-formula id="E5"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mtext>Coverage of clusters to unique ideas</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mtext>overlap</mml:mtext><mml:mrow><mml:mi mathvariant="normal">n</mml:mi><mml:mi mathvariant="normal">_</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">n</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">q</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">_</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">d</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">s</mml:mi></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where n_unique_ideas represents the total number of manually curated unique ideas. 
This metric evaluates the proportion of unique ideas represented by the computational clusters. In other words, this metric tells us how many of the unique ideas we would obtain if we extracted exactly 1 unique idea from each cluster and how many this computational approach would miss:</p><disp-formula id="E3"><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mtext>Proportion of clusters with unique ideas</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mtext>overlap</mml:mtext><mml:mrow><mml:mi mathvariant="normal">n</mml:mi><mml:mi mathvariant="normal">_</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">s</mml:mi></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where n_clusters is the total number of computationally generated clusters. This metric assesses the proportion of clusters containing at least 1 manually identified unique idea, highlighting the extent to which the computational clusters align with expert curation. A value below 1 indicates clusters without a unique idea, which can be either due to suboptimal clustering (ie, the additional clusters do not consist of new ideas but should be a part of another cluster containing a unique idea) or due to the clustering identifying new aspects that the human evaluator has not considered.</p><p>As an example, consider a scenario with 100 clusters and 96 unique ideas, where 62 clusters contain at least 1 unique idea. 
In this case, the 2 metrics are calculated as follows:</p><disp-formula id="E4"><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mtext>Proportion of clusters with unique ideas</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mn>62</mml:mn><mml:mn>100</mml:mn></mml:mfrac><mml:mo>=</mml:mo><mml:mn>0.62</mml:mn></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="E7"><mml:math id="eqn4"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mtext>Coverage of clusters to unique ideas</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mn>62</mml:mn><mml:mn>96</mml:mn></mml:mfrac><mml:mo>&#x2248;</mml:mo><mml:mn>0.65</mml:mn></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>This example indicates that 38% (38/100) of clusters do not contain a manually curated unique idea, while 35.42% (34/96) of unique ideas are not captured when extracting 1 idea per cluster. Together, these metrics provide a comprehensive assessment of the representativeness and completeness of the clustering results. It is important to note that a lack of perfect correspondence (values below 1) does not necessarily indicate &#x201C;error&#x201D; by the model. Because unique ideas in CM are interpretive and dependent on the researcher&#x2019;s chosen level of granularity, discrepancies may represent valid alternative structures. Therefore, these metrics should be interpreted as a measure of how closely IdeaDistiller mimics the specific synthesis decisions made by previous expert researchers.</p><p>While these 2 metrics aim to evaluate the results from 2 different perspectives, we applied the pipeline by choosing the number of clusters to be equal to the number of manually curated unique ideas, which resulted in identical values for both metrics. 
Therefore, in the following, we will report only one of the metrics.</p></sec><sec id="s2-5"><title>Topic Coherence</title><p>Although coherence was not used to select the best-performing model, we computed a coherence score for each idea to support researchers during the manual evaluation phase. In our setting, coherence measures the degree of semantic similarity among all ideas within the same cluster, with higher coherence indicating greater internal consistency. This score can help assess how well an idea fits within its assigned cluster, allowing for special focus on ideas with low coherence scores during the postcomputational or refinement phase. In practice, we computed embeddings for all individual ideas using a pretrained sentence-BERT (bidirectional encoder representations from transformers) model [<xref ref-type="bibr" rid="ref22">22</xref>]. The coherence score of each idea is then calculated as the cosine similarity between its embedding and the mean embedding of all ideas in its assigned cluster.</p></sec><sec id="s2-6"><title>Generated Output</title><p>Our approach, IdeaDistiller, produces an output file designed to support CM researchers during the idea synthesis phase. This file includes a comprehensive table listing all participant ideas alongside the results of the computational analysis, facilitating manual review and refinement.</p><p>The output contains the following components:</p><list list-type="bullet"><list-item><p>Cluster labels: each idea is accompanied by an assigned cluster label, indicating the group to which the idea belongs based on semantic similarity.</p></list-item><list-item><p>Sorting: to aid manual evaluation, the ideas are sorted according to their cluster assignments (ie, similar ideas should appear close to each other in the file), and clusters are organized based on semantic similarity. 
This structure enables researchers to easily assess whether adjacent clusters are sufficiently similar to warrant merging, potentially representing them with a common, unique idea.</p></list-item><list-item><p>Suggested unique ideas: the file provides a computationally suggested representative idea for each cluster. When available, original, manually curated, and unique ideas are also included for comparison. This feature is useful when applying the tool to previously filtered datasets or when evaluating the alignment between manual judgments and computational outputs.</p></list-item><list-item><p>Coherence scores: semantic coherence scores are provided for each idea, offering insights into the internal consistency and quality of the clustering process. A low coherence score for an idea may indicate a poor fit within its assigned cluster and can prompt further review and adjustment.</p></list-item></list><p>These outputs offer researchers a structured foundation for conducting a comprehensive evaluation and refinement of the results to finalize the idea synthesis process.</p></sec><sec id="s2-7"><title>Ethical Considerations</title><p>All projects that contributed data for this work have been ethically assessed and/or approved. Further details can be found in the respective manuscripts [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref16">16</xref>]. For this study, permission to use the data was granted by the data collectors, and no actual outcomes are reported. The data were used exclusively for the development and testing of the tool and are neither required for its further use nor can they be retrieved from the tool.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>As outlined in the <italic>Datasets</italic> section, the datasets used vary in characteristics such as size, topic, language, and participant demographics. 
This diversity allows us to evaluate the usefulness and generalizability of the proposed approach under diverse conditions.</p><sec id="s3-1"><title>Experiments and Parameter Selection</title><sec id="s3-1-1"><title>Sensitivity Analysis and Parameter Exploration</title><p>Given that a definitive &#x201C;ground truth&#x201D; for idea synthesis is rarely available in real-world research contexts, our experimental grid is intended as a sensitivity analysis. We aimed to demonstrate the robustness of the IdeaDistiller framework across a wide range of configurations, rather than to nominate a single &#x201C;optimal&#x201D; model for all future applications. We aimed to maintain a one-to-one correspondence between clusters and unique ideas and selected the number of clusters to match the number of unique ideas in each dataset using the metrics described above.</p><p>Using BERTopic, we explored different parameter configurations for the three main components of the computational pipeline: (1) embedding models, (2) dimensionality reduction methods, and (3) clustering algorithms.</p><p>The specific parameters and methods tested are summarized in <xref ref-type="table" rid="table2">Table 2</xref>. 
The total number of experiments conducted per dataset is as follows:</p><disp-formula id="E6"><mml:math id="eqn5"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mtext>Total experiments</mml:mtext><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mtext>Number of clustering algorithms</mml:mtext><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x00D7;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mtext>Number of embedding models</mml:mtext><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x00D7;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mtext>Number of dimensionality reduction methods</mml:mtext><mml:mo>&#x00D7;</mml:mo><mml:mtext>Number of dimensions</mml:mtext><mml:mo stretchy="false">)</mml:mo></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Accordingly, the resulting number of experiments is as follows:</p><list list-type="bullet"><list-item><p>For the English datasets: 3&#x00D7;3&#x00D7;(2&#x00D7;9)=162 experiments per dataset</p></list-item><list-item><p>For the Swedish datasets: 3&#x00D7;2&#x00D7;(2&#x00D7;9)=108 experiments per dataset</p></list-item></list><p>This allowed us to assess the impact of different parameters and evaluate performance under diverse configurations. 
Our results showed that no single parameter configuration consistently yielded the best performance across all datasets, reflecting the heterogeneity of the datasets.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Overview of the different parameter settings that were tested.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Parameters</td><td align="left" valign="bottom">Options</td></tr></thead><tbody><tr><td align="left" valign="top">Clustering</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>K-means clustering, agglomerative clustering, and spectral clustering.</p></list-item></list></td></tr><tr><td align="left" valign="top">Embedding</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>English: fastText<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, SentenceTransformer, Flair<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></p></list-item><list-item><p>Swedish: KB-BERT<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup>, SentenceTransformer</p></list-item></list></td></tr><tr><td align="left" valign="top">Dimensionality reduction</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Truncated SVD<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>, PCA<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup> (with dimensions varied from n=2 to n=10)</p></list-item></list></td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>fastText: Facebook artificial intelligence word embedding model.</p></fn><fn id="table2fn2"><p><sup>b</sup>Flair: natural language processing embedding framework.</p></fn><fn id="table2fn3"><p><sup>c</sup>KB-BERT: Kungliga Biblioteket bidirectional encoder representations from transformers.</p></fn><fn id="table2fn4"><p><sup>d</sup>SVD: singular value decomposition.</p></fn><fn id="table2fn5"><p><sup>e</sup>PCA: principal 
component analysis.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-1-2"><title>Clustering Algorithm Selection</title><p>We evaluated multiple clustering algorithms to determine the most suitable method for our approach. We explored hierarchical density&#x2013;based spatial clustering of applications with noise due to its ability to discover clusters of varying densities without requiring a predefined number of clusters. However, initial trials resulted in very high numbers of clusters and outliers, reflecting the complexity of the dataset. We, therefore, concluded that the absence of explicit control over the number of clusters rendered hierarchical density&#x2013;based spatial clustering of applications with noise less appropriate for our application.</p><p>Overall, we compared the performance of 3 clustering algorithms across all datasets: k-means, agglomerative clustering (with Ward linkage), and spectral clustering.</p><p>As illustrated in <xref ref-type="fig" rid="figure2">Figure 2</xref>, spectral clustering appeared to show less stable performance and tended, on average, to yield lower results compared with the other methods. In contrast, k-means and agglomerative clustering seemed to perform at broadly similar levels across the experiments. These impressions are based on visual inspection of the box plots and should, therefore, be interpreted with appropriate caution.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Box plots of the 3 clustering algorithms (k-means, agglomerative clustering, and spectral clustering) showing their proportions of clusters with unique ideas.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e86877_fig02.png"/></fig><p>We suggest agglomerative clustering as a practical baseline configuration for the pipeline. 
While k-means is a robust alternative, its sensitivity to random initialization can yield varying outcomes across different runs unless specific measures&#x2014;such as fixed seeds or multiple restarts&#x2014;are implemented. We prioritized agglomerative clustering primarily for its inherent determinism, as well as its established use within the field, contributing to its acceptance among researchers. In the following sections, we report the performance based on agglomerative clustering only, excluding the other clustering approaches.</p></sec><sec id="s3-1-3"><title>Embedding Models</title><p>Embedding models are used to represent sentences as numerical vectors within a high-dimensional space, where the proximity between vectors captures semantic similarity. Various embedding models exist, often tailored to specific languages, which can significantly influence clustering performance.</p><p>For the English datasets, we evaluated 3 models: fastText, SentenceTransformer (all-MiniLM-L6-v2), and Flair. For the Swedish datasets, we tested KB-BERT (Kungliga Biblioteket bidirectional encoder representations from transformers; bert-base-swedish-cased) and SentenceTransformer (paraphrase-multilingual-MiniLM-L12-v2).</p><p>Our experiments demonstrated that, for the available English datasets, the SentenceTransformer (all-MiniLM-L6-v2) showed the highest mean performance within this exploratory study, as shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>, yielding better alignment with the manually curated unique ideas. For our Swedish datasets, KB-BERT (bert-base-swedish-cased) showed the highest performance; however, given the small number of Swedish datasets, these results should be interpreted as indicative rather than definitive (<xref ref-type="fig" rid="figure3">Figure 3</xref>). 
Nevertheless, we selected KB-BERT because its training on Swedish-specific corpora is expected to improve its ability to capture linguistic nuances unique to the language.</p><p>In the following sections, we report the performance of the results based on agglomerative clustering combined with the selected embedding models.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Box plots of embedding models showing their proportion of clusters with unique ideas using agglomerative clustering on the English and Swedish languages. fastText: Facebook artificial intelligence word embedding model; Flair: natural language processing embedding framework; MiniLM: miniature language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e86877_fig03.png"/></fig></sec><sec id="s3-1-4"><title>Dimensionality Reduction</title><p>Dimensionality reduction is applied to high-dimensional embeddings to reduce computational complexity while preserving the most relevant semantic information. Different techniques emphasize the preservation of distinct aspects of the original data structure, which has an impact on downstream clustering performance.</p><p>Although uniform manifold approximation and projection is the default technique in BERTopic and often yields high-quality embeddings, we excluded it from our experiments. While stochastic methods like uniform manifold approximation and projection can be made reproducible through fixed random seeds, we opted for deterministic alternatives (truncated singular value decomposition [SVD] and principal component analysis) to ensure consistent results across different computational environments and versions. 
This choice was made to prioritize practical deployment and ease of use, and does not imply that nondeterministic methods are theoretically inferior for this task.</p><p>Each method was evaluated across a range of dimensions (from n=2 to n=10), with results summarized in <xref ref-type="fig" rid="figure4">Figures 4</xref><xref ref-type="fig" rid="figure5"/>-<xref ref-type="fig" rid="figure6">6</xref>. Language-specific analyses revealed that for our English datasets, the best performance seemed to be achieved by using truncated SVD with a higher dimensionality (n=10), whereas for the available Swedish datasets, truncated SVD with a lower dimensionality (n=6) showed, on average, slightly better results. However, given the small number of datasets analyzed (6 in English and 3 in Swedish) and the associated variability in the results, we recommend using truncated SVD with a lower dimensionality (n=6) across both languages. This setting seems to provide more robust and stable performance across all the datasets, whereas a higher dimensionality (n=10) exhibited greater variance in clustering quality when considering both English and Swedish datasets.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Box plots of dimensionality reduction methods (truncated SVD and PCA) with varying numbers of dimensions, showing their proportion of clusters with unique ideas for the English datasets. PCA: principal component analysis; SVD: singular value decomposition.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e86877_fig04.png"/></fig><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Box plots of dimensionality reduction methods (truncated SVD and PCA) with varying numbers of dimensions, showing their proportion of clusters with unique ideas for the Swedish datasets. 
PCA: principal component analysis; SVD: singular value decomposition.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e86877_fig05.png"/></fig><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Box plots of dimensionality reduction methods (truncated SVD and PCA) with varying numbers of dimensions, showing their proportion of clusters with unique ideas for both languages. PCA: principal component analysis; SVD: singular value decomposition.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e86877_fig06.png"/></fig><p>With these selected methods, the average performance of IdeaDistiller is 0.63 for English datasets and 0.67 for Swedish datasets, meaning that 63% (n=100) and 67% (n=100) of clusters, respectively, contain at least 1 manually curated unique idea, while 37% (33% for Swedish) of the clusters do not contain a unique idea. The latter can be due to suboptimal cluster structure; that is, those clusters could be merged with other existing clusters; however, in some cases, these clusters might provide new, additional perspectives on the topic. These results indicate an existing correlation between manual and computational results, meaning that IdeaDistiller can support the researcher in their analysis. At the same time, they also showcase that the output of IdeaDistiller will need human revision to ensure methodological rigor. The clustered and sorted lists of original ideas produced by IdeaDistiller therefore represent an intermediate step in the idea synthesis process. 
While this tool does not provide a final solution for idea synthesis, it offers structural assistance and reduces cognitive load on the researchers.</p></sec><sec id="s3-1-5"><title>Number of Clusters</title><p>In our previous experiments, the number of clusters, <italic>k</italic>, was always set to the number of manually curated unique ideas to evaluate how well IdeaDistiller reproduces these results. With a new CM study, however, this value will not be available, but the researcher might instead use a few standard values. To investigate the effect of different cluster numbers on the results, we varied <italic>k</italic> between 70 and 110 while keeping the suggested options for all other parameters. The results were evaluated using both the proportion of clusters with unique ideas (called proportion) and the coverage of clusters to unique ideas (called coverage), as well as the coherence score, which is independent of the manually curated unique ideas. <xref ref-type="fig" rid="figure7">Figure 7</xref> shows that, due to the way they were constructed, proportion and coverage have an inverse relationship, crossing where <italic>k</italic> equals the actual number of manually curated unique ideas. The effect of a change in <italic>k</italic> depends on the dataset. Within the considered range of <italic>k</italic>, the change in the proportion score lies in the range of 0.12 to 0.18, while the coverage score shows slightly higher differences, ranging from 0.1 to 0.21 across our datasets. The average coherence score for each dataset monotonically increases with the number of clusters. This is expected, since it measures how closely connected the ideas within a cluster are, with more clusters allowing for better separation of the different aspects covered by the dataset. Based on these results, we generally suggest choosing <italic>k</italic> slightly higher than the number of unique ideas one is aiming for. This way, the user obtains more coherent clusters. 
At the same time, the clusters become smaller and possibly overlapping in content. To address this, IdeaDistiller provides an output file where the clusters are sorted semantically, such that similar clusters are grouped together and can easily be identified when reviewing the list.</p><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>Impact of cluster number (<italic>k</italic>) on proportion, coverage, and coherence scores across 9 datasets.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e86877_fig07.png"/></fig></sec></sec><sec id="s3-2"><title>Coherence Calculation</title><p>To provide a concrete example of how coherence scores reflect the semantic unity of the output, <xref ref-type="table" rid="table3">Table 3</xref> illustrates a cluster (ID 86) from the English dataset &#x201C;Stress.&#x201D; The scores represent the cosine similarity between individual statements and the cluster centroid. 
Higher scores (eg, 0.76) correspond to statements that align closely with the core topic, while lower scores (eg, 0.52) indicate less closely related concepts like &#x201C;support.&#x201D; This allows researchers to quickly validate the thematic consistency of the tool&#x2019;s output and identify potential outliers.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Representative clusters and associated coherence scores, illustrating semantic proximity to the cluster centroid.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Idea</td><td align="left" valign="bottom">Coherence score</td></tr></thead><tbody><tr><td align="left" valign="top">Look for more information</td><td align="left" valign="top">0.76</td></tr><tr><td align="left" valign="top">Have more information</td><td align="left" valign="top">0.70</td></tr><tr><td align="left" valign="top">More information</td><td align="left" valign="top">0.60</td></tr><tr><td align="left" valign="top">More support</td><td align="left" valign="top">0.52</td></tr></tbody></table></table-wrap></sec><sec id="s3-3"><title>Illustrative Case Study</title><p>We conducted a small audit on one of the English datasets (&#x201C;Drug abuse&#x201D;) to gain insights into the semantic clusters without unique ideas. We found that, in many cases, these clusters did not contain novel content but slight variations of already selected ideas. However, some of these clusters contained ideas that could be deemed &#x201C;novel&#x201D; depending on the focus of the study. One example is 3 statements on background checks (&#x201C;give background checks when you apply for a license,&#x201D; &#x201C;background checks on patients,&#x201D; and &#x201C;ask for consensual background checks to know if a person is on drugs or has experienced&#x2026;&#x201D;), with none of them being included in the list of unique ideas. 
For the given study, these ideas were considered out of scope and at the same time loosely related to 2 retained unique ideas (&#x201C;The federal government should run a program that takes convicted drug dealers/users into mandatory rehab&#x201D; and &#x201C;Hold addicts accountable for staying clean and away from drugs&#x201D;). However, depending on the focus of the study, an idea specifically about background checks could be a useful addition. This example illustrates how scope and granularity decisions shape what is included as a unique idea and shows that the tool provides a valuable starting point for idea synthesis. However, it remains independent of these study-specific decisions and therefore does not replace researcher judgment.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study shows that semantic clustering can be used to support the idea synthesis process in a CM study. For 9 different datasets, IdeaDistiller was able to reproduce expert synthesis with an average alignment of 64%, highlighting both the value of the tool in organizing the original ideas as well as the need for expert involvement in the curation process. The latter is facilitated by providing sorted lists of all original ideas, based on which the results can easily be adjusted.</p><p>Topic modeling, specifically BERTopic, has been successfully applied in various contexts, such as Twitter (subsequently rebranded X) tweets [<xref ref-type="bibr" rid="ref20">20</xref>]. Our study extends these findings and demonstrates the potential to enhance the CM process, specifically the idea synthesis step, using various health care&#x2013;related datasets. IdeaDistiller, our proposed approach, significantly reduces the manual effort traditionally required to identify and group similar ideas by automating the clustering and thereby presorting the participant-generated ideas. 
This automation enables researchers to allocate more time to the subsequent analysis and interpretation of ideas rather than the manual data processing phase.</p><p>Moreover, the approach provides additional outputs, including coherence scores and semantically sorted clusters, which facilitate a more structured and efficient review process. IdeaDistiller is designed to support CM researchers, while keeping them in control of their own analysis. Extracting final, unique ideas from each cluster still necessitates human judgment, and suggested representative ideas require expert validation, particularly in complex or sensitive domains such as health care.</p><p>Our findings align with Prescott et al [<xref ref-type="bibr" rid="ref23">23</xref>], who argue that while generative artificial intelligence (AI) can significantly reduce time and resource demands, hybrid approaches incorporating HITL validation remain necessary. Similarly, Morgan [<xref ref-type="bibr" rid="ref24">24</xref>] concludes that although AI tools demonstrate great promise for supporting coding tasks, integrating AI with qualitative data analysis presents challenges. In agreement with Prescott et al [<xref ref-type="bibr" rid="ref23">23</xref>], human analysts retain a crucial advantage in identifying nuanced meanings and interpreting context-specific themes&#x2014;an essential skill for rigorous qualitative research.</p><p>Integrating an automated step into the idea synthesis process offers substantial benefits for methods such as CM. These include improved scalability for handling larger datasets. We also concur with Chen et al [<xref ref-type="bibr" rid="ref25">25</xref>], who advocate for using machine learning to identify inconsistencies and ambiguities in qualitative coding. 
By automating the simpler stages of coding, researchers can allocate their expertise more efficiently to the complex interpretative tasks that require theoretical sensitivity and contextual knowledge&#x2014;an outcome facilitated by the tool developed in this study.</p></sec><sec id="s4-2"><title>Limitations</title><p>A limitation encountered in this study was the variability in dataset sizes and languages, which posed challenges for optimal parameter tuning. Smaller datasets constrained the ability of machine learning models to generalize effectively. Nevertheless, by using stable parameter settings, we achieved consistent and satisfactory performance across most datasets.</p><p>While a HITL approach remains indispensable for analyzing complex qualitative data, developing systems that support and enhance this collaboration is a promising direction for the future evolution of the CM methodology. Further application of the approach across diverse contexts will be necessary to fully understand its capabilities and identify opportunities for improvement.</p><p>In summary, our findings suggest that IdeaDistiller can significantly streamline a previously time-consuming step of the CM process, improve scalability, and maintain reasonable accuracy. Nonetheless, human validation remains essential to ensure the quality and relevance of results in real-world applications.</p></sec><sec id="s4-3"><title>Future Work</title><p>In this study, we created IdeaDistiller, a tool trained on previous CM studies. An important next step will be to use and test the tool in further real-life studies across different domains to enable continued evaluation with respect to both usability and the experience of time-saving.</p><p>Additionally, advances in explainable AI could be leveraged to improve the interpretability of clustering results by providing insights into how and why specific items are grouped together. 
For instance, techniques such as attention heatmaps, feature importance scores, or natural language explanations could help domain experts understand the rationale behind each cluster, thereby enhancing transparency, supporting critical evaluation, and increasing trust in the system&#x2019;s outputs.</p><p>Overall, the continued development of semiautomated tools, coupled with human expertise, presents a compelling path forward for improving the efficiency, scalability, and methodological rigor of CM and other qualitative research methodologies.</p></sec></sec></body><back><ack><p>The authors declare the use of generative artificial intelligence in the writing process. ChatGPT was used under full human supervision to refine, correct, or edit the draft to improve language clarity. Napkin.ai was used to generate <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p></ack><notes><sec><title>Funding</title><p>This research was partly funded by the Regional Cancer Centre West.</p></sec><sec><title>Data Availability</title><p>The data used to evaluate the performance are not publicly available due to ethical restrictions related to participant confidentiality. The code for IdeaDistiller is publicly available [<xref ref-type="bibr" rid="ref26">26</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>FS, AEG, MH, and LV provided insights into the CM methodology and contributed datasets. All authors contributed to the experimental design of the study. CQ and NKS performed data preprocessing, designed and implemented IdeaDistiller, and conducted the experiments. FS, AEG, MH, CQ, and NKS were involved in the testing and analysis of IdeaDistiller. 
All authors contributed to the writing and editing of the manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">BERT</term><def><p>bidirectional encoder representations from transformers</p></def></def-item><def-item><term id="abb3">BERTopic</term><def><p>bidirectional encoder representations topic modeling</p></def></def-item><def-item><term id="abb4">CM</term><def><p>concept mapping</p></def></def-item><def-item><term id="abb5">HITL</term><def><p>human-in-the-loop</p></def></def-item><def-item><term id="abb6">KB-BERT</term><def><p>Kungliga Biblioteket bidirectional encoder representations from transformers</p></def></def-item><def-item><term id="abb7">SVD</term><def><p>singular value decomposition</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Kane</surname><given-names>M</given-names> </name><name name-style="western"><surname>Trochim</surname><given-names>WMK</given-names> </name></person-group><source>Concept Mapping for Planning and Evaluation</source><year>2007</year><publisher-name>SAGE Publications</publisher-name><pub-id pub-id-type="doi">10.4135/9781412983730</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pantha</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jones</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gartoulla</surname><given-names>P</given-names> </name><name name-style="western"><surname>Gray</surname><given-names>R</given-names> </name></person-group><article-title>A systematic review to inform 
the development of a reporting guideline for concept mapping research</article-title><source>Methods Protoc</source><year>2023</year><month>10</month><day>17</day><volume>6</volume><issue>5</issue><fpage>101</fpage><pub-id pub-id-type="doi">10.3390/mps6050101</pub-id><pub-id pub-id-type="medline">37888033</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vaughn</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Jones</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Booth</surname><given-names>E</given-names> </name><name name-style="western"><surname>Burke</surname><given-names>JG</given-names> </name></person-group><article-title>Concept mapping methodology and community-engaged research: a perfect pairing</article-title><source>Eval Program Plann</source><year>2017</year><month>02</month><volume>60</volume><fpage>229</fpage><lpage>237</lpage><pub-id pub-id-type="doi">10.1016/j.evalprogplan.2016.08.013</pub-id><pub-id pub-id-type="medline">27591958</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rosas</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Kane</surname><given-names>M</given-names> </name></person-group><article-title>Quality and rigor of the concept mapping methodology: a pooled study analysis</article-title><source>Eval Program Plann</source><year>2012</year><month>05</month><volume>35</volume><issue>2</issue><fpage>236</fpage><lpage>245</lpage><pub-id pub-id-type="doi">10.1016/j.evalprogplan.2011.10.003</pub-id><pub-id pub-id-type="medline">22221889</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name 
name-style="western"><surname>McLinden</surname><given-names>D</given-names> </name><name name-style="western"><surname>Vaughn</surname><given-names>LM</given-names> </name></person-group><article-title>Concept mapping</article-title><source>Handbook of Methodological Approaches to Community-Based Research: Qualitative, Quantitative, and Mixed Methods</source><year>2016</year><publisher-name>Oxford University Press</publisher-name><fpage>305</fpage><lpage>314</lpage><pub-id pub-id-type="doi">10.1093/med:psych/9780190243654.003.0030</pub-id><pub-id pub-id-type="other">9780190243654</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Trochim</surname><given-names>W</given-names> </name><name name-style="western"><surname>Kane</surname><given-names>M</given-names> </name></person-group><article-title>Concept mapping: an introduction to structured conceptualization in health care</article-title><source>Int J Qual Health Care</source><year>2005</year><month>06</month><volume>17</volume><issue>3</issue><fpage>187</fpage><lpage>191</lpage><pub-id pub-id-type="doi">10.1093/intqhc/mzi038</pub-id><pub-id pub-id-type="medline">15872026</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ashe</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Azim</surname><given-names>FT</given-names> </name><name name-style="western"><surname>Ariza-Vega</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Determinants of implementing reablement into research or practice: a concept mapping study</article-title><source>Physiother Res Int</source><year>2022</year><month>07</month><volume>27</volume><issue>3</issue><fpage>e1949</fpage><pub-id pub-id-type="doi">10.1002/pri.1949</pub-id><pub-id
pub-id-type="medline">35434890</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Windsor</surname><given-names>LC</given-names> </name></person-group><article-title>Using concept mapping in community-based participatory research: a mixed methods approach</article-title><source>J Mix Methods Res</source><year>2013</year><month>07</month><volume>7</volume><issue>3</issue><fpage>274</fpage><lpage>293</lpage><pub-id pub-id-type="doi">10.1177/1558689813479175</pub-id><pub-id pub-id-type="medline">26561484</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hassmiller Lich</surname><given-names>K</given-names> </name><name name-style="western"><surname>Urban</surname><given-names>JB</given-names> </name><name name-style="western"><surname>Frerichs</surname><given-names>L</given-names> </name><name name-style="western"><surname>Dave</surname><given-names>G</given-names> </name></person-group><article-title>Extending systems thinking in planning and evaluation using group concept mapping and system dynamics to tackle complex problems</article-title><source>Eval Program Plann</source><year>2017</year><month>02</month><volume>60</volume><fpage>254</fpage><lpage>264</lpage><pub-id pub-id-type="doi">10.1016/j.evalprogplan.2016.10.008</pub-id><pub-id pub-id-type="medline">27825622</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McLinden</surname><given-names>D</given-names> </name></person-group><article-title>And then the internet happened: thoughts on the future of concept mapping</article-title><source>Eval Program 
Plann</source><year>2017</year><month>02</month><volume>60</volume><fpage>293</fpage><lpage>300</lpage><pub-id pub-id-type="doi">10.1016/j.evalprogplan.2016.10.009</pub-id><pub-id pub-id-type="medline">27793357</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vaughn</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Jacquez</surname><given-names>F</given-names> </name><name name-style="western"><surname>Marschner</surname><given-names>D</given-names> </name><name name-style="western"><surname>McLinden</surname><given-names>D</given-names> </name></person-group><article-title>See what we say: using concept mapping to visualize Latino immigrant&#x2019;s strategies for health interventions</article-title><source>Int J Public Health</source><year>2016</year><month>09</month><volume>61</volume><issue>7</issue><fpage>837</fpage><lpage>845</lpage><pub-id pub-id-type="doi">10.1007/s00038-016-0838-4</pub-id><pub-id pub-id-type="medline">27220545</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Montgomery</surname><given-names>L</given-names> </name><name name-style="western"><surname>Vaughn</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Jacquez</surname><given-names>F</given-names> </name></person-group><article-title>Engaging adolescents in the fight against drug abuse and addiction: a concept mapping approach</article-title><source>Health Educ Behav</source><year>2022</year><month>04</month><volume>49</volume><issue>2</issue><fpage>272</fpage><lpage>280</lpage><pub-id pub-id-type="doi">10.1177/10901981211068416</pub-id><pub-id pub-id-type="medline">35043709</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vaughn</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Sunny</surname><given-names>CE</given-names> </name><name name-style="western"><surname>Lindquist-Grantz</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Successful suicide screening in the pediatric emergency department: youth, parent, researcher, and clinician perspectives</article-title><source>Arch Suicide Res</source><year>2020</year><volume>24</volume><issue>sup1</issue><fpage>124</fpage><lpage>141</lpage><pub-id pub-id-type="doi">10.1080/13811118.2018.1541034</pub-id><pub-id pub-id-type="medline">30537901</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Smith</surname><given-names>F</given-names> </name><name name-style="western"><surname>Fredriksson</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gunnarsd&#x00F3;ttir</surname><given-names>K&#x00C1;</given-names> </name><name name-style="western"><surname>Holtenman</surname><given-names>M</given-names> </name><name name-style="western"><surname>Carlsson</surname><given-names>C</given-names> </name></person-group><article-title>Increasing credibility in government assignments: an example from Sweden of stakeholder involvement by using concept mapping</article-title><source>BMJ Open Qual</source><year>2025</year><month>06</month><day>12</day><volume>14</volume><issue>2</issue><fpage>e003021</fpage><pub-id pub-id-type="doi">10.1136/bmjoq-2024-003021</pub-id><pub-id pub-id-type="medline">40514055</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Smith</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Hellstr&#x00F6;m</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gunnarsd&#x00F3;ttir</surname><given-names>K&#x00C1;</given-names> </name><etal/></person-group><article-title>Exploring the meaning, role and experiences of a patient-led social innovation for people affected by cancer: a new collaborative care model complementing traditional cancer rehabilitation in Sweden</article-title><source>BMJ Open Qual</source><year>2021</year><month>10</month><volume>10</volume><issue>4</issue><fpage>e001400</fpage><pub-id pub-id-type="doi">10.1136/bmjoq-2021-001400</pub-id><pub-id pub-id-type="medline">34686486</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Smith</surname><given-names>F</given-names> </name><name name-style="western"><surname>Gunnarsd&#x00F3;ttir</surname><given-names>K&#x00C1;</given-names> </name><name name-style="western"><surname>Genell</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Evaluating the implementation and use of the regional cancer plan in Western Sweden through concept mapping</article-title><source>Int J Qual Health Care</source><year>2019</year><month>08</month><day>1</day><volume>31</volume><issue>7</issue><fpage>44</fpage><lpage>52</lpage><pub-id pub-id-type="doi">10.1093/intqhc/mzy241</pub-id><pub-id pub-id-type="medline">30576515</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jelodar</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Latent Dirichlet allocation (LDA) and topic modeling:
models, applications, a survey</article-title><source>Multimed Tools Appl</source><year>2019</year><month>06</month><volume>78</volume><issue>11</issue><fpage>15169</fpage><lpage>15211</lpage><pub-id pub-id-type="doi">10.1007/s11042-018-6894-4</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Blei</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>AY</given-names> </name><name name-style="western"><surname>Jordan</surname><given-names>MI</given-names> </name></person-group><article-title>Latent Dirichlet allocation</article-title><source>J Mach Learn Res</source><year>2003</year><access-date>2026-06-16</access-date><volume>3</volume><fpage>993</fpage><lpage>1022</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf">https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Grootendorst</surname><given-names>M</given-names> </name></person-group><article-title>BERTopic: neural topic modeling with a class-based TF-IDF procedure</article-title><source>arXiv</source><comment>Preprint posted online on March 11, 2022</comment><comment>arXiv:2203.05794</comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Egger</surname><given-names>R</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>J</given-names> </name></person-group><article-title>A topic modeling comparison between LDA, NMF, Top2Vec, and BERTopic to demystify Twitter posts</article-title><source>Front
Sociol</source><year>2022</year><volume>7</volume><issue>7</issue><fpage>886498</fpage><pub-id pub-id-type="doi">10.3389/fsoc.2022.886498</pub-id><pub-id pub-id-type="medline">35602001</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><access-date>2026-06-16</access-date><conf-name>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name><conf-date>Jun 2-7, 2019</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/N19-1423.pdf">https://aclanthology.org/N19-1423.pdf</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Reimers</surname><given-names>N</given-names> </name><name name-style="western"><surname>Gurevych</surname><given-names>I</given-names> </name></person-group><article-title>Sentence-BERT: sentence embeddings using Siamese BERT-networks</article-title><conf-name>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)</conf-name><conf-date>Nov 3-7, 2019</conf-date><pub-id pub-id-type="doi">10.18653/v1/D19-1410</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Prescott</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Yeager</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ham</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Comparing the efficacy and efficiency of human and generative AI: qualitative thematic analyses</article-title><source>JMIR AI</source><year>2024</year><month>08</month><day>2</day><volume>3</volume><fpage>e54482</fpage><pub-id pub-id-type="doi">10.2196/54482</pub-id><pub-id pub-id-type="medline">39094113</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Morgan</surname><given-names>DL</given-names> </name></person-group><article-title>Exploring the use of artificial intelligence for qualitative data analysis: the case of ChatGPT</article-title><source>Int J Qual Methods</source><year>2023</year><month>10</month><volume>22</volume><pub-id pub-id-type="doi">10.1177/16094069231211248</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>NC</given-names> </name><name name-style="western"><surname>Drouhard</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kocielnik</surname><given-names>R</given-names> </name><name name-style="western"><surname>Suh</surname><given-names>J</given-names> </name><name name-style="western"><surname>Aragon</surname><given-names>CR</given-names> </name></person-group><article-title>Using machine learning to support qualitative coding in social science: shifting the focus to ambiguity</article-title><source>ACM Trans Interact Intell 
Syst</source><year>2018</year><month>06</month><day>30</day><volume>8</volume><issue>2</issue><fpage>1</fpage><lpage>20</lpage><pub-id pub-id-type="doi">10.1145/3185515</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="web"><article-title>IdeaDistiller</article-title><source>Chalmers GitLab</source><access-date>2026-06-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://git.chalmers.se/e-commons/publications/ideadistiller">https://git.chalmers.se/e-commons/publications/ideadistiller</ext-link></comment></nlm-citation></ref></ref-list></back></article>