<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e54133</article-id><article-id pub-id-type="doi">10.2196/54133</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Robust Automated Harmonization of Heterogeneous Data Through Ensemble Machine Learning: Algorithm Development and Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Yang</surname><given-names>Doris</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhou</surname><given-names>Doudou</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Cai</surname><given-names>Steven</given-names></name><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gan</surname><given-names>Ziming</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Pencina</surname><given-names>Michael</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Avillach</surname><given-names>Paul</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Cai</surname><given-names>Tianxi</given-names></name><degrees>SCD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Hong</surname><given-names>Chuan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Biomedical Informatics, Harvard Medical School</institution><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Statistics and Data Science, National University of Singapore</institution><addr-line>Singapore</addr-line><country>Singapore</country></aff><aff id="aff3"><institution>Department of Computer Science, Rensselaer Polytechnic Institute</institution><addr-line>Rochester</addr-line><addr-line>NY</addr-line><country>United States</country></aff><aff id="aff4"><institution>Department of Statistics, University of Chicago</institution><addr-line>Chicago</addr-line><addr-line>IL</addr-line><country>United States</country></aff><aff id="aff5"><institution>Department of Biostatistics &#x0026; Bioinformatics, Duke University</institution><addr-line>Durham</addr-line><addr-line>NC</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name 
name-style="western"><surname>Lovis</surname><given-names>Christian</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Alam</surname><given-names>Fakhare</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Wang</surname><given-names>Lei</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Chuan Hong, PhD, Department of Biostatistics &#x0026; Bioinformatics, Duke University, 2424 Erwin Rd, Room 9022, Durham, NC, 27710, United States, 1 9035269514; <email>chuan.hong@duke.edu</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>22</day><month>1</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e54133</elocation-id><history><date date-type="received"><day>30</day><month>10</month><year>2023</year></date><date date-type="rev-recd"><day>26</day><month>11</month><year>2024</year></date><date date-type="accepted"><day>30</day><month>11</month><year>2024</year></date></history><copyright-statement>&#x00A9;Doris Yang, Doudou Zhou, Steven Cai, Ziming Gan, Michael Pencina, Paul Avillach, Tianxi Cai, Chuan Hong. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 22.1.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e54133"/><abstract><sec><title>Background</title><p>Cohort studies contain rich clinical data across large and diverse patient populations and are a common source of observational data for clinical research. Because large scale cohort studies are both time and resource intensive, one alternative is to harmonize data from existing cohorts through multicohort studies. However, given differences in variable encoding, accurate variable harmonization is difficult.</p></sec><sec><title>Objective</title><p>We propose SONAR (Semantic and Distribution-Based Harmonization) as a method for harmonizing variables across cohort studies to facilitate multicohort studies.</p></sec><sec sec-type="methods"><title>Methods</title><p>SONAR used semantic learning from variable descriptions and distribution learning from study participant data. Our method learned an embedding vector for each variable and used pairwise cosine similarity to score the similarity between variables. 
This approach was built off 3 National Institutes of Health cohorts, including the Cardiovascular Health Study, the Multi-Ethnic Study of Atherosclerosis, and the Women&#x2019;s Health Initiative. We also used gold standard labels to further refine the embeddings in a supervised manner.</p></sec><sec sec-type="results"><title>Results</title><p>The method was evaluated using manually curated gold standard labels from the 3 National Institutes of Health cohorts. We evaluated both the intracohort and intercohort variable harmonization performance. The supervised SONAR method outperformed existing benchmark methods for almost all intracohort and intercohort comparisons using area under the curve and top<italic>-k</italic> accuracy metrics. Notably, SONAR was able to significantly improve harmonization of concepts that were difficult for existing semantic methods to harmonize.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>SONAR achieves accurate variable harmonization within and between cohort studies by harnessing the complementary strengths of semantic learning and variable distribution learning.</p></sec></abstract><kwd-group><kwd>ensemble learning</kwd><kwd>semantic learning</kwd><kwd>distribution learning</kwd><kwd>variable harmonization</kwd><kwd>machine learning</kwd><kwd>cardiovascular health study</kwd><kwd>intracohort comparison</kwd><kwd>intercohort comparison</kwd><kwd>gold standard labels</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Data harmonization, the process that ensures the compatibility of diverse datasets for their cogent integration, is an indispensable tool in today&#x2019;s data-driven research environment [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. 
The power of data harmonization lies in its capacity to enhance the statistical robustness of studies, thereby enabling the investigation of intricate research questions unattainable within a single dataset&#x2019;s limits. This ability to pool data from existing sources expedites research processes, reduces associated costs, and accelerates the translation of knowledge into practical applications [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. However, despite the advantages of pooling data, the path to effective data harmonization is laden with challenges [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. The most pronounced among these are the discrepancies in how individual datasets document and measure similar concepts [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. Even within datasets, documentation for analogous concepts is not consistent, thereby further complicating data integration.</p><p>Current data harmonization techniques mainly depend on manual curation [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. In spite of its widespread use, manual curation is time-intensive, prone to human error, and often constrained in scope, focusing predominantly on a single disease or condition. These drawbacks limit the applicability and efficacy of manual curation in broader, more complex contexts and highlight the need for advanced harmonization methodologies [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref17">17</xref>]. 
Recently, there has been a shift towards automated techniques, like freely available mapping tools [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref20">20</xref>] and algorithms based on corpora or lexicons [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref23">23</xref>]. These tools aim to map terminologies across varied clinical domains. Yet, such methods might still necessitate significant domain expertise and depend on benchmark labels. Furthermore, many of these methods cater to only one kind of medical code, for instance, drug or lab codes.</p><p>Another promising approach for data harmonization is through semantic learning. In a study by Zhou et al [<xref ref-type="bibr" rid="ref24">24</xref>], an automated harmonization algorithm was proposed to cotrain embeddings for electronic health record codes from multiple institutions by combining both electronic health record co-occurrence information and textual information from the code descriptions. As a technique that uses machine learning to infer meaning from data, semantic learning presents a promising avenue for enhancing data harmonization. However, semantic learning&#x2019;s applicability is limited by its demand for extensive, high-quality training data, its sensitivity to noisy or unreliable data, and the complexity involved in manually crafting semantic features.</p><p>In this paper, we propose SONAR (Semantic and Distribution-Based Harmonization), an innovative data harmonization approach that synthesizes the strengths of semantic learning with patient data learning. Patient data offers an alternative, unexplored source of learning for data harmonization purposes. The patient-level values for each variable provide information about the underlying concept that a variable measures, separate from the textual information in variable descriptions. 
By harnessing the context comprehension and inferential power of semantic learning and augmenting it with the capacity of patient data learning to capture concept-specific trends and nuances, we propose a more robust and accurate data harmonization strategy. We demonstrate the implementation and advantages of the proposed approach through its application across 3 major National Institutes of Health cohort studies: the Multi-Ethnic Study of Atherosclerosis (MESA) [<xref ref-type="bibr" rid="ref25">25</xref>], the Cardiovascular Health Study (CHS) [<xref ref-type="bibr" rid="ref26">26</xref>], and the Women&#x2019;s Health Initiative (WHI) [<xref ref-type="bibr" rid="ref27">27</xref>]. Our aspiration is that the method proposed here will provide a valuable foundation for future studies aiming to tackle the multifaceted challenges of data harmonization between heterogeneous datasets.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>The data used in this study were obtained from 3 well-established cohort studies, namely CHS, MESA, and WHI. Ethics approval (IRB17-2059) was granted by the Institutional Review Board of the Harvard Faculty of Medicine. Institutional Review Board approval was secured for access to all studies&#x2019; retrospective data. De-identified data was accessed through a secure cloud storage platform. Participants were not compensated for the use of their data in this study.</p></sec><sec id="s2-2"><title>Data Sources</title><p>The CHS was a population-based longitudinal study initiated to determine the risk factors for the development and progression of clinically validated cardiovascular disease in adults aged 65 years and older. Beginning in 1989, the study enrolled 5888 participants from 4 US communities: Forsyth County, NC; Sacramento County, CA; Washington County, MD; and Pittsburgh, PA. 
The cohort consisted of two recruitment waves: the original cohort (1989-1990) and the African American cohort (1992-1993). Comprehensive baseline examinations were conducted, including medical history, physical examinations, laboratory tests, and others, with annual follow-ups to ascertain cardiovascular events [<xref ref-type="bibr" rid="ref26">26</xref>].</p><p>The WHI was a long-term national health study that focused on strategies for preventing heart disease, breast and colorectal cancer, and osteoporotic fractures in postmenopausal women. Launched in 1991, the WHI involved multiple clinical trials and an observational study, enrolling a total of 161,808 women aged 50-79 years across 40 clinical centers throughout the United States. The participants were ethnically diverse, reflecting the demographic composition of the US population. Extensive data on lifestyle, health, and medical history were collected at baseline and at regular intervals throughout the study, creating a rich source of information for a variety of research endeavors [<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>The MESA study was a prospective cohort designed to delve into the prevalence and progression of subclinical cardiovascular disease among community-dwelling adults. MESA assessed a diverse, population-based sample of 6814 asymptomatic men and women aged between 45 and 84 years from 2000 to 2018. The participants were recruited from 6 field centers across the United States, including Wake Forest University, Columbia University, Johns Hopkins University, University of Minnesota, Northwestern University, and University of California &#x2013; Los Angeles. The MESA cohort was made up of 38% White, 28% African-American, 22% Hispanic, and 12% Asian (primarily Chinese) individuals. 
Since its inception in July 2000, the study conducted 6 examinations, each occurring every 18 to 24 months [<xref ref-type="bibr" rid="ref25">25</xref>].</p></sec><sec id="s2-3"><title>Data Extraction</title><p>The process of data extraction necessitated the gathering of documentation for variables within the CHS, MESA, and WHI studies. This information was procured from the Database of Genotypes and Phenotypes (dbGaP) [<xref ref-type="bibr" rid="ref28">28</xref>]. We used dbGaP metadata to procure the following salient information for each variable within the study: (1) variable accession, (2) variable name, (3) variable description, and (4) dataset accession. While the variable name and variable description were not necessarily unique within or between studies, the variable accession was a unique identifier across all studies. We used the variable description strings as the semantic data in our model. From the raw variable description strings, we further extracted and removed the temporal period during which the variables were measured by parsing for key temporal terms, such as visit and exam. This data extraction process facilitated a comprehensive understanding of the variables&#x2019; conceptual characteristics, thereby providing the foundation for the subsequent data harmonization efforts.</p><p>We used the dataset accession and variable accession identifiers to access (1) variable metadata and (2) the patient-level data for the set of variables already extracted from dbGaP. To allow for relevant distribution comparisons between variables, we kept only continuous data by filtering variables using the continuous flag in the metadata.</p></sec><sec id="s2-4"><title>Data Preprocessing</title><p>Our study scope was primarily focused on the harmonization of continuous variables at the conceptual level. 
A &#x201C;concept&#x201D; in this context was defined as the underlying notion or theme that a variable represents, independent of the specific unit or time point of measurement. For example, a biomarker such as C-reactive protein, despite being reported in different units across different visits, was treated as having the same concept. Moreover, concepts were sometimes encoded in natural language or questionnaire form, rather than standard medical terms. We focused on conceptual level harmonization for several reasons. Researchers conducting multicohort studies are often interested in identifying all variables corresponding to a concept. Depending on the application, they might be interested in further refining this concept-level harmonization or also harmonizing the variable values across different units or collection time periods. Removing temporal information and units allowed us to focus on the essential meaning or theme underlying the variable, thereby facilitating the primary task of concept-level harmonization, which is manually challenging and resource intensive, paving the way for further data harmonization. Moreover, doing so enhanced comparability across studies, as variables with the same concept were treated as equivalent, irrespective of the units used. During the initial phase of data preprocessing, we streamlined variable descriptions by eliminating temporal information phrases. This practice not only simplified the descriptions but also augmented their comparability.</p><p>We also applied filters to variables according to their values. First, we removed variables with incomplete patient data. To preserve a significant portion of variables, we considered incomplete patient data at the subgroup level rather than the individual patient level. We defined patient subgroups using the anchor variables of age, race, and sex. 
Per the characteristics of the study populations and data availability, we defined 4 age buckets (&#x2264;59 years, 60 to 69 years, 70 to 79 years, and &#x2265;80 years). For the categorical anchor variables, we used 2 predefined race categories (White and Black) and 2 predefined sex categories (female and male), yielding a total of 16 possible patient subgroups (calculated as 4&#x00D7;2&#x00D7;2). For each study, we removed variables that had no patient data for one or more subgroups, considering only patient subgroups that were represented in the cohort.</p><p>Second, we purged variables that had uniformly zero values across all patients. This removal was necessary as variables without variability do not offer predictive power and thus, contribute little to subsequent analyses. Subsequently, we identified variables within the same study with identical descriptions and treated them as a single entity. Rather than maintaining these as separate variables, we amalgamated their distribution vectors by computing their element-wise mean. This consolidation concurrently reduced redundancy and bolstered the statistical power and robustness of downstream analyses. The underlying principle driving these measures was the emphasis on the core conceptual content encapsulated within variables, a focus that lays the groundwork for a more efficient and meaningful process of data harmonization.</p></sec><sec id="s2-5"><title>Creation of Gold Standard Labels</title><p>To assess the harmonization accuracy of SONAR, we manually created a set of gold standard labels. The process began with the curation of a concept list, consisting of common diseases, laboratory results, and medications, consistent with the goal of harmonization at the conceptual level (details in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, Section 4). 
With the concept list in hand, each of the 3 independent reviewers assigned raw variables from all 3 studies to the corresponding concepts based on their descriptions. Not all variables had labels, since the curated concept list was not comprehensive of all underlying concepts present in the studies. To ensure consistency and accuracy, we adopted a consensus-based approach for handling any discrepancies among the reviewers. In cases of disagreements, the reviewers discussed their rationales for their assignments, and through a process of discussion, literature review, and majority vote, they reached a consensus on the appropriate concept assignment. This rigorously prepared set of annotations, backed by consensus, formed our gold standard labels. In particular, each pair of variables corresponding to the same underlying concept formed a gold standard pair (ie, a pair of variables that a harmonization algorithm should map to each other). Such pairs consisted of variables from two different datasets (intercohort) or the same dataset (intracohort), since multiple variables from a single dataset could correspond to the same underlying concept. These labels offered a reliable standard against which we could validate our semantic learning and patient data learning techniques.</p></sec><sec id="s2-6"><title>SONAR (Semantic and Distribution-Based Harmonization)</title><p>The proposed SONAR approach had 4 steps, including semantic learning, distribution learning, concatenation of the two learnings, and supervised training. 
Underpinning both semantic learning and distribution learning was the idea that variables with similar textual descriptions and patient-level value distributions were more likely to encode the same underlying concept.</p><sec id="s2-6-1"><title>Step 1: Semantic Learning</title><p>We combined two existing pretrained large language models (LLMs) for semantic learning, CODER (Crosslingual Knowledge-Infused Medical Term Embedding) [<xref ref-type="bibr" rid="ref29">29</xref>] and SapBERT (Self-Alignment Pretraining for Biomedical Entity Representations) [<xref ref-type="bibr" rid="ref30">30</xref>]. CODER, a semantic representation learning tool, is a type of pretrained language model using a contrastive learning framework [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. It is particularly suited to the biomedical terms and descriptions found in clinical studies because it was trained on terms, concepts, and their relations in the Unified Medical Language System (UMLS) [<xref ref-type="bibr" rid="ref32">32</xref>] knowledge graph. SapBERT is a pretrained, masked learning model also trained on synonyms in the UMLS knowledge graph. Both CODER and SapBERT create embedding vector representations from textual input, which were the variable description strings in our method. Combining CODER and SapBERT, which use different pretraining algorithms and training sets within the same knowledge graph, allowed us to increase the robustness of our semantic embeddings. The advantage of using these existing pretrained language models was a balance between saved training time and specificity to the domain and task. The output of this step was a CODER embedding vector (<italic>VAR_coder</italic>) and a SapBERT embedding vector (<italic>VAR_sapbert</italic>) for each variable. 
The goal of this process was to transform the variable descriptions into a uniform, computable format that captured their semantic essence.</p></sec><sec id="s2-6-2"><title>Step 2: Distribution Learning</title><p>In order to conduct comparisons of patient-level values for pairs of variables, we constructed vectors encoding the distributions of variables. For each study, we aggregated patients into the previously defined 16 subgroups, using the anchor variables of age, race, and sex. These anchor variables were present in all studies and clinically relevant to most of the other study variables. It was possible and permissible that the number of anchor groups varied across different cohorts. For instance, in the WHI cohort that consisted only of women, the number of anchor groups was reduced to 8 (4&#x00D7;2&#x00D7;1). Then, we computed the subgroup quartiles (ie, the 25th, 50th, and 75th percentiles), thus yielding a numerical distribution vector for each variable (<italic>VAR_dist</italic>) of length up to 48 (16&#x00D7;3). This process was designed to capture the distribution characteristics of each variable within defined anchor groups, thereby adding a computationally efficient layer of contextual understanding to our harmonization strategy. Moreover, the quartile distribution encoding strategy allowed for greater flexibility in data handling by preserving patient confidentiality.</p></sec><sec id="s2-6-3"><title>Step 3: Concatenation</title><p>This stage combined the insights gained from semantic learning and distribution learning. Specifically, for each variable, we concatenated its <italic>VAR_coder</italic>, <italic>VAR_sapbert</italic>, and <italic>VAR_dist</italic> vectors into a single <italic>VAR_concat</italic> vector. 
In order to ensure standardized comparisons with <italic>VAR_concat</italic> vectors of the same length, we kept only the variable distribution dimensions in <italic>VAR_dist</italic> corresponding to anchor groups present in both cohorts in each interdataset harmonization. We kept all available variable distribution dimensions for intradataset harmonization. Prior to concatenation, we also normalized the <italic>VAR_coder</italic>, <italic>VAR_sapbert</italic>, and <italic>VAR_dist</italic> vectors separately to ensure they operated on the same scale. In particular, we took the absolute magnitude of each element in a vector, selected the maximum value among these values, then divided the original vector by the maximum absolute value. After normalization in this manner, the elements in each vector were bounded within the &#x2212;1 to 1 range. This was crucial because it ensured that no vector&#x2019;s magnitude dominated during the concatenation process, thereby preserving the integrity of the information they conveyed. We then concatenated the normalized <italic>VAR_coder</italic>, <italic>VAR_sapbert</italic>, and <italic>VAR_dist</italic> vectors, resulting in a vector for each variable that we denoted as <italic>VAR_concat</italic> with dimension <italic>d</italic>. This concatenated vector, containing both semantic and distribution information, formed the foundation of our automated harmonization strategy.</p></sec><sec id="s2-6-4"><title>Step 4: Supervised Training</title><p>We further refined the SONAR method through supervised training of a <inline-formula><mml:math id="ieqn1"><mml:mi>d</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>d</mml:mi></mml:math></inline-formula> rotation matrix <italic>M</italic> via gradient descent on a loss function (details in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, Section 1). The supervised embeddings were then the matrix product of the unsupervised SONAR embeddings and <italic>M</italic>. 
Overall, this 4-step process served to capture the nuances and complexities of variable-concept relationships in a computationally efficient and robust manner. The systematic integration of semantic learning with distribution learning offered an innovative approach to data harmonization, promising to enhance accuracy, efficiency, and overall applicability of harmonization strategies.</p></sec></sec><sec id="s2-7"><title>Evaluation Metrics</title><p>We assessed the performance of the proposed SONAR approach both within individual cohorts (intracohort) and between different cohorts (intercohort). We considered the area under the curve (AUC) of the receiver operating characteristic curve as the overall metric for harmonization accuracy. Specifically, for each underlying concept, we first computed the cosine similarity of the embedding vectors for known concept pairs (positive pairs) and an equal number of randomly selected concept pairs (negative pairs), where the cosine similarity measured the cosine of the angle between 2 vectors in a multidimensional space, effectively quantifying how similar they are. The AUC was then calculated, summarizing the overall accuracy of our method across varying decision thresholds for a given underlying concept. The overall AUC was the average of the concept-level AUC values. To reduce the effect of outliers within the sampled negative pairs, we averaged the 3 overall AUC values calculated for 3 sets of sampled negative pairs.</p><p>We also evaluated the performance of SONAR on hard concepts, defined to be concepts for which the benchmark SapBERT AUC was below the threshold of 0.900. The hard AUC was the average of the concept-level AUC values for the hard concepts. 
This was an important metric for demonstrating the added value of distribution learning and more broadly the complementary effects of the two forms of learning.</p><p>To further scrutinize the performance of our method, we reported the top<italic>-k</italic> accuracy (acc@<italic>k</italic>) for mapping of variables from Cohort A to Cohort B, where the cohorts were identical for intradataset mapping. For variable <inline-formula><mml:math id="ieqn2"><mml:mi>a</mml:mi></mml:math></inline-formula> in Cohort A, we let <inline-formula><mml:math id="ieqn3"><mml:msub><mml:mrow><mml:mi>B</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> be the set of <italic>k</italic> variables within Cohort B whose embeddings had the largest cosine similarity with variable <inline-formula><mml:math id="ieqn4"><mml:mi>a</mml:mi></mml:math></inline-formula>, and we let <inline-formula><mml:math id="ieqn5"><mml:msub><mml:mrow><mml:mi>G</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> be the set of variables within Cohort B that were in a positive gold standard pair with variable <inline-formula><mml:math id="ieqn6"><mml:mi>a</mml:mi></mml:math></inline-formula>. Then, the acc@<italic>k</italic> for an underlying concept for the mapping from Cohort A to Cohort B was the number of variables <inline-formula><mml:math id="ieqn7"><mml:mi>a</mml:mi></mml:math></inline-formula> such that <inline-formula><mml:math id="ieqn8"><mml:msub><mml:mrow><mml:mi>B</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2229;</mml:mo><mml:msub><mml:mrow><mml:mi>G</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2260;</mml:mo><mml:mi>&#x2205;</mml:mi></mml:math></inline-formula> divided by the total number of gold standard variables in Cohort A corresponding to the underlying concept. 
The acc@<italic>k</italic> for the mapping from Cohort A to Cohort B was then the average of the acc@<italic>k</italic> across all underlying concepts. This prevented dominance by underlying concepts with many corresponding gold standard variables. To calculate the intracohort acc@<italic>k</italic>, we first computed the intracohort acc@<italic>k</italic> for each underlying concept by averaging the acc@<italic>k</italic> for that underlying concept across all 3 intracohort comparisons (intra-CHS, intra-MESA, and intra-WHI). Underlying concepts were only averaged over the comparisons for which they were relevant, so the intracohort acc@<italic>k</italic> for a concept that had gold standard pairs in only CHS and MESA was the average of the CHS acc@<italic>k</italic> and the MESA acc@<italic>k</italic> for the concept in question. Then, the intracohort acc@<italic>k</italic> was the average of the acc@<italic>k</italic> across all underlying concepts. The intercohort acc@<italic>k</italic> was calculated similarly, except across the 6 intercohort mappings (CHS to MESA, CHS to WHI, MESA to CHS, MESA to WHI, WHI to CHS, and WHI to MESA). Finally, the overall acc@<italic>k</italic> was calculated across the 9 total mappings for each underlying concept. We obtained acc@<italic>k</italic> values for <italic>k</italic> values of 1, 3, 5, 10, and 20 in order to provide additional insight into SONAR&#x2019;s effectiveness at capturing true positives at different thresholds. These rigorous evaluations allowed us to confidently assert the efficacy of our harmonization method in both intracohort and intercohort settings.</p><p>For the supervised portion of our method, we used 2-fold cross-validation. We pooled cosine similarity values from both rounds of training before AUC calculations and averaged the 2 resulting cosine similarity values for non&#x2013;gold standard pairs before acc@<italic>k</italic> calculations. 
To provide a comprehensive understanding of our method&#x2019;s performance, we compared the AUC and other accuracy metrics obtained by SONAR with those obtained when using semantic learning (BioBERT [Bidirectional Encoder Representations from Transformers for Biomedical Text Mining], CODER, SapBERT) or distribution learning alone. We also obtained metrics for the concatenated semantic portion of SONAR (ie, CODER concatenated with SapBERT) to further highlight the added value of distribution learning. This comparative approach allowed us to illuminate the relative strengths and contributions of the individual components of our method and the added value achieved by their combination.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Data Extraction and Preprocessing</title><p>The proposed SONAR approach had 4 steps, including semantic learning, distribution learning, concatenation of the two learnings, and supervised training (<xref ref-type="fig" rid="figure1">Figure 1</xref>). We extracted metadata and semantic data from the dbGaP for 14,717 CHS variables, 22,147 MESA variables, and 6207 WHI variables. Filtering for continuous, complete, and nonzero variables using metadata and patient-level data from Service WorkBench, as well as consolidating variables with identical semantic descriptions, yielded 2076 CHS variables, 2525 MESA variables, and 1328 WHI variables. Patient data was available for 12 subgroups, 16 subgroups, and 6 subgroups for CHS, MESA, and WHI, respectively. This yielded distribution vectors of length 36, 48, and 18 for intra-CHS, intra-MESA, and intra-WHI harmonization, respectively. Based on overlapping patient subgroups between the cohorts, we used distribution vectors of length 36, 12, and 18 for intercohort CHS-MESA, CHS-WHI, and MESA-WHI harmonization, respectively.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Workflow of semantic and distribution-based harmonization. 
(A) Overall workflow of semantic and distribution-based harmonization. (B) Workflow of the distribution learning step. CODER: Crosslingual Knowledge-Infused Medical Term Embedding; SapBERT: Self-Alignment Pretraining for Biomedical Entity Representations; SONAR: Semantic and Distribution-Based Harmonization; Var: variable.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e54133_fig01.png"/></fig></sec><sec id="s3-2"><title>Gold Standard Labels</title><p>We identified a total of 123 concepts with continuous data, consisting of 112 laboratory test concepts, 6 disease concepts, and 5 medication concepts. A total of 531 variables across all cohorts were identified as gold standard representations of these concepts. These yielded 606, 318, and 89 gold standard concept pairs for intradataset harmonization evaluation within the CHS, MESA, and WHI, respectively. For interdataset harmonization evaluation, we had 352, 325, and 133 gold standard concept pairs for CHS-MESA, CHS-WHI, and MESA-WHI, respectively. 
Detailed numerical summaries of gold standard labels are provided in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Data preprocessing and gold standard labels.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Variables</td><td align="left" valign="bottom" colspan="3">Intracohort, n</td><td align="left" valign="bottom" colspan="3">Intercohort, n</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">CHS<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">MESA<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">WHI<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">CHS-MESA</td><td align="left" valign="top">CHS-WHI</td><td align="left" valign="top">MESA-WHI</td></tr></thead><tbody><tr><td align="left" valign="top">dbGaP<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup> variables</td><td align="left" valign="top">14,717</td><td align="left" valign="top">22,147</td><td align="left" valign="top">6207</td><td align="left" valign="top">36,864</td><td align="left" valign="top">51,581</td><td align="left" valign="top">28,354</td></tr><tr><td align="left" valign="top">Preprocessed variables</td><td align="left" valign="top">2076</td><td align="left" valign="top">2525</td><td align="left" valign="top">1328</td><td align="left" valign="top">4601</td><td align="left" valign="top">3404</td><td align="left" valign="top">3853</td></tr><tr><td align="left" valign="top">Distribution dimensions</td><td align="left" valign="top">36</td><td align="left" valign="top">48</td><td align="left" valign="top">18</td><td align="left" valign="top">36</td><td align="left" valign="top">12</td><td align="left" valign="top">18</td></tr><tr><td align="left" valign="top">Gold standard concepts</td><td 
align="left" valign="top">53</td><td align="left" valign="top">39</td><td align="left" valign="top">32</td><td align="left" valign="top">53</td><td align="left" valign="top">54</td><td align="left" valign="top">48</td></tr><tr><td align="left" valign="top">Gold standard variables</td><td align="left" valign="top">204</td><td align="left" valign="top">125</td><td align="left" valign="top">86</td><td align="left" valign="top">242</td><td align="left" valign="top">229</td><td align="left" valign="top">146</td></tr><tr><td align="left" valign="top">Gold standard pairs</td><td align="left" valign="top">606</td><td align="left" valign="top">318</td><td align="left" valign="top">89</td><td align="left" valign="top">352</td><td align="left" valign="top">325</td><td align="left" valign="top">133</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>CHS: Cardiovascular Health Study.</p></fn><fn id="table1fn2"><p><sup>b</sup>MESA: Multi-Ethnic Study of Atherosclerosis.</p></fn><fn id="table1fn3"><p><sup>c</sup>WHI: Women&#x2019;s Health Initiative.</p></fn><fn id="table1fn4"><p><sup>d</sup>dbGaP: Database of Genotypes and Phenotypes.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Intracohort Evaluation</title><p>Supervised SONAR achieved a strong performance across all intracohort AUC (<xref ref-type="fig" rid="figure2">Figure 2</xref> and Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) and acc@<italic>k</italic> (<xref ref-type="fig" rid="figure3">Figure 3</xref> and Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) measures, exceeding or meeting all benchmark comparisons. The number of hard concepts for each intracomparison was 13 concepts, 5 concepts, and 5 concepts for intra-CHS, intra-MESA, and intra-WHI, respectively. 
It is notable that the distribution-only AUC was significantly higher than the AUCs of the semantic-only methods for the intra-CHS and intra-WHI hard concepts, illustrating the advantage of incorporating both semantic and distribution learning in SONAR. While the addition of supervised training only improved overall AUC performance for the intra-WHI comparison, it improved intracohort AUC performance on hard concepts for all 3 intracohort comparisons, exceeding all benchmark methods. The addition of supervised training also improved acc@<italic>k</italic> performance of SONAR across different values of <italic>k</italic>. Across intracohort evaluations, distribution learning provided a clear added value to semantic learning, in spite of a weaker distribution-only performance in comparison to the semantic components of SONAR (CODER only, SapBERT only, CODER + SapBERT). Moreover, there was not a single best semantic learning method between CODER and SapBERT using the various AUC and acc@<italic>k</italic> metrics, providing support for the use of both semantic learning methods in SONAR.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Comparison of areas under the curve for different methods in intracohort comparisons. 
AUC: area under the curve; BioBERT: Bidirectional Encoder Representations from Transformers for Biomedical Text Mining; CHS: Cardiovascular Health Study; CODER: Crosslingual Knowledge-Infused Medical Term Embedding; MESA: Multi-Ethnic Study of Atherosclerosis; SapBERT: Self-Alignment Pretraining for Biomedical Entity Representations; SONAR: Semantic and Distribution-Based Harmonization; WHI: Women&#x2019;s Health Initiative.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e54133_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Comparison of top-10 sensitivities for different methods in both intracohort and intercohort comparisons. acc@10: top-10 accuracy; BioBERT: Bidirectional Encoder Representations from Transformers for Biomedical Text Mining; CODER: Crosslingual Knowledge-Infused Medical Term Embedding; SapBERT: Self-Alignment Pretraining for Biomedical Entity Representations; SONAR: Semantic and Distribution-Based Harmonization.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e54133_fig03.png"/></fig></sec><sec id="s3-4"><title>Intercohort Evaluation</title><p>Supervised SONAR also achieved a consistently high performance in intercohort harmonization evaluation, exceeding or meeting all benchmark comparisons except for acc@3 and acc@20 (<xref ref-type="fig" rid="figure3">Figure 3</xref>). The number of hard concepts for each intercohort comparison was 4 concepts, 11 concepts, and 5 concepts for the CHS-MESA, CHS-WHI, and MESA-WHI comparisons, respectively. Similar to intracohort harmonization, supervised training improved AUC performance on hard concepts for all 3 comparisons, exceeding all benchmark methods. 
In contrast with intracohort harmonization, supervised training also improved AUC performance on all concepts to above 0.99 for all 3 intercohort comparisons (<xref ref-type="fig" rid="figure4">Figure 4</xref>). Notably, the CODER only and SapBERT only AUC values were higher for intercohort harmonization as compared to intracohort harmonization because identical variable descriptions were allowed for intercohort semantic learning.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Comparison of areas under the curve for different methods in the intercohort comparisons. AUC: area under the curve; BioBERT: Bidirectional Encoder Representations from Transformers for Biomedical Text Mining; CHS: Cardiovascular Health Study; CODER: Crosslingual Knowledge-Infused Medical Term Embedding; MESA: Multi-Ethnic Study of Atherosclerosis; SapBERT: Self-Alignment Pretraining for Biomedical Entity Representations; SONAR: Semantic and Distribution-Based Harmonization; WHI: Women&#x2019;s Health Initiative.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e54133_fig04.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>SONAR provides a robust method for concept-level data harmonization across and within longitudinal cohort studies by efficiently constructing variable embeddings from longitudinal cohort study variable descriptions and data. We used a supervised algorithm to refine the concatenated embeddings built from normalized distribution and semantic vectors. When applied to harmonization within and between 3 National Institutes of Health cohort studies, SONAR achieved enhanced performance compared to benchmark methods, with notable improvements over semantic-only and distribution-only methods. These results demonstrate the effectiveness of learning from both semantic and patient-level data. 
Our method is able to conduct this learning with relatively low training costs by taking advantage of the one-time pretraining of biomedical entity representation&#x2013;based language models using domain-specific UMLS terms.</p><p>There are some limitations to this study. We focused only on continuous variables with complete data, excluding categorical variables and variables with incomplete data. Future studies can expand variable distribution learning to categorical variables and develop methodologies for comparing distribution vectors of differing lengths in the case of variables with incomplete data, allowing for harmonization of a greater range of variables. Moreover, we focused on concept-level variable harmonization, which may be inappropriate for certain applications that require more granular harmonization, such as at the unit level or comparisons between different temporal periods. SONAR already drastically reduces the resources needed for concept-level harmonization, which is a crucial first step for more granular harmonization. Future studies could also automate the manual process of unit and temporal harmonization across variables corresponding to the same concept. Another direction for future research is automating the underlying concept identification process, perhaps by variable clustering using the newest generation of LLMs such as GPT-4. While powerful LLMs like GPT-4 could further improve the semantic learning aspect of our model, future research would need to adapt these generally trained models to the biomedical domain and control for the monetary costs associated with GPT-4 use. Additionally, although the current implementation of intercohort SONAR involves harmonization of 2 studies, it can be adapted to harmonize 3 or more studies.</p><p>SONAR paves the way for multicohort studies through high-quality and efficient variable harmonization. 
Harmonization at the concept level is the crucial first step for researchers seeking to identify all variables corresponding to a disease, medication, or laboratory test of interest. Manual curation of or simple keyword searches for such variables are resource intensive and error-prone. The automation provided by SONAR is particularly helpful for harmonization of thousands of variables between large-scale cohort studies with heterogeneous variable encoding of underlying concepts. Multicohort studies that draw upon existing cohort studies are a resource-efficient method for studying risk factors associated with diseases and their pathogenesis. By effectively expanding the study population, multicohort studies also allow for greater statistical power and diversity in the study population, leading to greater generalizability of results and an enhanced ability to study health disparities. Beyond variable harmonization between cohorts, the variable embeddings generated through SONAR can be used for downstream analyses within multicohort studies, including for feature selection and the construction of knowledge graphs.</p><p>In conclusion, SONAR provides an approach for investigators to integrate semantic and patient data for multicohort variable harmonization. We demonstrate the robust added value of distribution learning when combined with existing semantic learning methods in variable mapping between cohorts. 
This innovation will facilitate and expedite multicohort studies by building upon existing data from decades-long cohort studies.</p></sec></body><back><ack><p>CH and MP&#x2019;s contributions to this work were partially funded by National Institute of Neurological Disorders and Stroke grant number R61-NS120246-02.</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">acc@k</term><def><p>top-<italic>k</italic> accuracy</p></def></def-item><def-item><term id="abb2">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb3">BioBERT</term><def><p>Bidirectional Encoder Representations from Transformers for Biomedical Text Mining</p></def></def-item><def-item><term id="abb4">CHS</term><def><p>Cardiovascular Health Study</p></def></def-item><def-item><term id="abb5">CODER</term><def><p>Crosslingual Knowledge-Infused Medical Term Embedding</p></def></def-item><def-item><term id="abb6">dbGaP</term><def><p>Database of Genotypes and Phenotypes</p></def></def-item><def-item><term id="abb7">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb8">MESA</term><def><p>Multi-Ethnic Study of Atherosclerosis</p></def></def-item><def-item><term id="abb9">SapBERT</term><def><p>Self-Alignment Pretraining for Biomedical Entity Representations</p></def></def-item><def-item><term id="abb10">SONAR</term><def><p>Semantic and Distribution-Based Harmonization</p></def></def-item><def-item><term id="abb11">UMLS</term><def><p>Unified Medical Language System</p></def></def-item><def-item><term id="abb12">WHI</term><def><p>Women&#x2019;s Health Initiative</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Curran</surname><given-names>PJ</given-names> </name><name 
name-style="western"><surname>Hussong</surname><given-names>AM</given-names> </name></person-group><article-title>Integrative data analysis: the simultaneous analysis of multiple data sets</article-title><source>Psychol Methods</source><year>2009</year><month>06</month><volume>14</volume><issue>2</issue><fpage>81</fpage><lpage>100</lpage><pub-id pub-id-type="doi">10.1037/a0015914</pub-id><pub-id pub-id-type="medline">19485623</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sansone</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Rocca-Serra</surname><given-names>P</given-names> </name><name name-style="western"><surname>Field</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Toward interoperable bioscience data</article-title><source>Nat Genet</source><year>2012</year><month>01</month><day>27</day><volume>44</volume><issue>2</issue><fpage>121</fpage><lpage>126</lpage><pub-id pub-id-type="doi">10.1038/ng.1054</pub-id><pub-id pub-id-type="medline">22281772</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shaw</surname><given-names>DL</given-names> </name><name name-style="western"><surname>Ross</surname><given-names>JS</given-names> </name></person-group><article-title>US federal government efforts to improve clinical trial transparency with expanded trial registries and open data sharing</article-title><source>AMA J Ethics</source><year>2015</year><month>12</month><day>1</day><volume>17</volume><issue>12</issue><fpage>1152</fpage><lpage>1159</lpage><pub-id pub-id-type="doi">10.1001/journalofethics.2015.17.12.pfor1-1512</pub-id><pub-id pub-id-type="medline">26698589</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thompson</surname><given-names>A</given-names> </name></person-group><article-title>Thinking big: large-scale collaborative research in observational epidemiology</article-title><source>Eur J Epidemiol</source><year>2009</year><volume>24</volume><issue>12</issue><fpage>727</fpage><lpage>731</lpage><pub-id pub-id-type="doi">10.1007/s10654-009-9412-1</pub-id><pub-id pub-id-type="medline">19967428</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Walport</surname><given-names>M</given-names> </name><name name-style="western"><surname>Brest</surname><given-names>P</given-names> </name></person-group><article-title>Sharing research data to improve public health</article-title><source>Lancet</source><year>2011</year><month>02</month><day>12</day><volume>377</volume><issue>9765</issue><fpage>537</fpage><lpage>539</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(10)62234-9</pub-id><pub-id pub-id-type="medline">21216456</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bennett</surname><given-names>SN</given-names> </name><name name-style="western"><surname>Caporaso</surname><given-names>N</given-names> </name><name name-style="western"><surname>Fitzpatrick</surname><given-names>AL</given-names> </name><etal/></person-group><article-title>Phenotype harmonization and cross-study collaboration in GWAS consortia: the GENEVA experience</article-title><source>Genet Epidemiol</source><year>2011</year><month>04</month><volume>35</volume><issue>3</issue><fpage>159</fpage><lpage>173</lpage><pub-id pub-id-type="doi">10.1002/gepi.20564</pub-id><pub-id pub-id-type="medline">21284036</pub-id></nlm-citation></ref><ref 
id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jovicich</surname><given-names>J</given-names> </name><name name-style="western"><surname>Barkhof</surname><given-names>F</given-names> </name><name name-style="western"><surname>Babiloni</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Harmonization of neuroimaging biomarkers for neurodegenerative diseases: a survey in the imaging community of perceived barriers and suggested actions</article-title><source>Alzheimers Dement (Amst)</source><year>2019</year><month>12</month><volume>11</volume><fpage>69</fpage><lpage>73</lpage><pub-id pub-id-type="doi">10.1016/j.dadm.2018.11.005</pub-id><pub-id pub-id-type="medline">31673595</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tratwal</surname><given-names>J</given-names> </name><name name-style="western"><surname>Labella</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bravenboer</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Reporting guidelines, review of methodological standards, and challenges toward harmonization in bone marrow adiposity research. 
Report of the Methodologies Working Group of the International Bone Marrow Adiposity Society</article-title><source>Front Endocrinol (Lausanne)</source><year>2020</year><volume>11</volume><fpage>65</fpage><pub-id pub-id-type="doi">10.3389/fendo.2020.00065</pub-id><pub-id pub-id-type="medline">32180758</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ohmann</surname><given-names>C</given-names> </name><name name-style="western"><surname>Banzi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Canham</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Sharing and reuse of individual participant data from clinical trials: principles and recommendations</article-title><source>BMJ Open</source><year>2017</year><month>12</month><day>14</day><volume>7</volume><issue>12</issue><fpage>e018647</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2017-018647</pub-id><pub-id pub-id-type="medline">29247106</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pan</surname><given-names>H</given-names> </name><name name-style="western"><surname>Bakalov</surname><given-names>V</given-names> </name><name name-style="western"><surname>Cox</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Identifying datasets for cross-study analysis in dbGaP using PhenX</article-title><source>Sci Data</source><year>2022</year><month>09</month><day>1</day><volume>9</volume><issue>1</issue><fpage>532</fpage><pub-id pub-id-type="doi">10.1038/s41597-022-01660-4</pub-id><pub-id pub-id-type="medline">36050327</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Simko</surname><given-names>LC</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>L</given-names> </name><name name-style="western"><surname>Amtmann</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Challenges to the standardization of trauma data collection in burn, traumatic brain injury, spinal cord injury, and other trauma populations: a call for common data elements for acute and longitudinal trauma databases</article-title><source>Arch Phys Med Rehabil</source><year>2019</year><month>05</month><volume>100</volume><issue>5</issue><fpage>891</fpage><lpage>898</lpage><pub-id pub-id-type="doi">10.1016/j.apmr.2018.10.004</pub-id><pub-id pub-id-type="medline">31030731</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stilp</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Emery</surname><given-names>LS</given-names> </name><name name-style="western"><surname>Broome</surname><given-names>JG</given-names> </name><etal/></person-group><article-title>A system for phenotype harmonization in the National Heart, Lung, and Blood Institute Trans-Omics for Precision Medicine (TOPMed) program</article-title><source>Am J Epidemiol</source><year>2021</year><month>10</month><day>1</day><volume>190</volume><issue>10</issue><fpage>1977</fpage><lpage>1992</lpage><pub-id pub-id-type="doi">10.1093/aje/kwab115</pub-id><pub-id pub-id-type="medline">33861317</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tudur Smith</surname><given-names>C</given-names> </name><name name-style="western"><surname>Nevitt</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Appelbe</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Resource implications of preparing individual participant data from a clinical trial to share with external researchers</article-title><source>Trials</source><year>2017</year><month>07</month><day>17</day><volume>18</volume><issue>1</issue><fpage>319</fpage><pub-id pub-id-type="doi">10.1186/s13063-017-2067-4</pub-id><pub-id pub-id-type="medline">28712359</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fortier</surname><given-names>I</given-names> </name><name name-style="western"><surname>Raina</surname><given-names>P</given-names> </name><name name-style="western"><surname>Van den Heuvel</surname><given-names>ER</given-names> </name><etal/></person-group><article-title>Maelstrom Research guidelines for rigorous retrospective data harmonization</article-title><source>Int J Epidemiol</source><year>2017</year><month>02</month><day>1</day><volume>46</volume><issue>1</issue><fpage>103</fpage><lpage>105</lpage><pub-id pub-id-type="doi">10.1093/ije/dyw075</pub-id><pub-id pub-id-type="medline">27272186</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Burns</surname><given-names>NS</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>PW</given-names> </name></person-group><article-title>Learning what we didn&#x2019;t know - the SPRINT data analysis challenge</article-title><source>N Engl J Med</source><year>2017</year><month>06</month><day>8</day><volume>376</volume><issue>23</issue><fpage>2205</fpage><lpage>2207</lpage><pub-id pub-id-type="doi">10.1056/NEJMp1705323</pub-id><pub-id pub-id-type="medline">28445656</pub-id></nlm-citation></ref><ref 
id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Meeuws</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yue</surname><given-names>JK</given-names> </name><name name-style="western"><surname>Huijben</surname><given-names>JA</given-names> </name><etal/></person-group><article-title>Common data elements: critical assessment of harmonization between current multi-center traumatic brain injury studies</article-title><source>J Neurotrauma</source><year>2020</year><month>06</month><day>1</day><volume>37</volume><issue>11</issue><fpage>1283</fpage><lpage>1290</lpage><pub-id pub-id-type="doi">10.1089/neu.2019.6867</pub-id><pub-id pub-id-type="medline">32000562</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Budin-Lj&#x00F8;sne</surname><given-names>I</given-names> </name><name name-style="western"><surname>Isaeva</surname><given-names>J</given-names> </name><name name-style="western"><surname>Knoppers</surname><given-names>BM</given-names> </name><etal/></person-group><article-title>Data sharing in large research consortia: experiences and recommendations from ENGAGE</article-title><source>Eur J Hum Genet</source><year>2014</year><month>03</month><volume>22</volume><issue>3</issue><fpage>317</fpage><lpage>321</lpage><pub-id pub-id-type="doi">10.1038/ejhg.2013.131</pub-id><pub-id pub-id-type="medline">23778872</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>H</given-names> </name><name name-style="western"><surname>El-Kareh</surname><given-names>R</given-names> </name><name name-style="western"><surname>Goel</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Vineet</surname><given-names>FNU</given-names> </name><name name-style="western"><surname>Chapman</surname><given-names>WW</given-names> </name></person-group><article-title>An approach to improve LOINC mapping through augmentation of local test names</article-title><source>J Biomed Inform</source><year>2012</year><month>08</month><volume>45</volume><issue>4</issue><fpage>651</fpage><lpage>657</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2011.12.004</pub-id><pub-id pub-id-type="medline">22210167</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kopanitsa</surname><given-names>G</given-names> </name></person-group><article-title>Application of a Regenstrief RELMA V.6.6 to map Russian laboratory terms to LOINC</article-title><source>Methods Inf Med</source><year>2016</year><volume>55</volume><issue>2</issue><fpage>177</fpage><lpage>181</lpage><pub-id pub-id-type="doi">10.3414/ME15-01-0068</pub-id><pub-id pub-id-type="medline">26666563</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zunner</surname><given-names>C</given-names> </name><name name-style="western"><surname>B&#x00FC;rkle</surname><given-names>T</given-names> </name><name name-style="western"><surname>Prokosch</surname><given-names>HU</given-names> </name><name name-style="western"><surname>Ganslandt</surname><given-names>T</given-names> </name></person-group><article-title>Mapping local laboratory interface terms to LOINC at a German university hospital using RELMA V.5: a semi-automated approach</article-title><source>J Am Med Inform Assoc</source><year>2013</year><volume>20</volume><issue>2</issue><fpage>293</fpage><lpage>297</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2012-001063</pub-id><pub-id 
pub-id-type="medline">22802268</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peters</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kapusnik-Uner</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Bodenreider</surname><given-names>O</given-names> </name></person-group><article-title>Methods for managing variation in clinical drug names</article-title><source>AMIA Annu Symp Proc</source><year>2010</year><month>11</month><day>13</day><volume>2010</volume><fpage>637</fpage><lpage>641</lpage><pub-id pub-id-type="medline">21347056</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>L</given-names> </name><name name-style="western"><surname>Plasek</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Mahoney</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>FY</given-names> </name><name name-style="western"><surname>DiMaggio</surname><given-names>D</given-names> </name><name name-style="western"><surname>Rocha</surname><given-names>RA</given-names> </name></person-group><article-title>Mapping Partners Master Drug Dictionary to RxNorm using an NLP-based approach</article-title><source>J Biomed Inform</source><year>2012</year><month>08</month><volume>45</volume><issue>4</issue><fpage>626</fpage><lpage>633</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2011.11.006</pub-id><pub-id pub-id-type="medline">22142948</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fidahussein</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Vreeman</surname><given-names>DJ</given-names> </name></person-group><article-title>A corpus-based approach for automated LOINC mapping</article-title><source>J Am Med Inform Assoc</source><year>2014</year><volume>21</volume><issue>1</issue><fpage>64</fpage><lpage>72</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2012-001159</pub-id><pub-id pub-id-type="medline">23676247</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>D</given-names> </name><name name-style="western"><surname>Gan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Multiview Incomplete Knowledge Graph Integration with application to cross-institutional EHR data harmonization</article-title><source>J Biomed Inform</source><year>2022</year><month>09</month><volume>133</volume><fpage>104147</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2022.104147</pub-id><pub-id pub-id-type="medline">35872266</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bild</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Bluemke</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Burke</surname><given-names>GL</given-names> </name><etal/></person-group><article-title>Multi-ethnic study of atherosclerosis: objectives and design</article-title><source>Am J Epidemiol</source><year>2002</year><month>11</month><day>1</day><volume>156</volume><issue>9</issue><fpage>871</fpage><lpage>881</lpage><pub-id pub-id-type="doi">10.1093/aje/kwf113</pub-id><pub-id pub-id-type="medline">12397006</pub-id></nlm-citation></ref><ref 
id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fried</surname><given-names>LP</given-names> </name><name name-style="western"><surname>Borhani</surname><given-names>NO</given-names> </name><name name-style="western"><surname>Enright</surname><given-names>P</given-names> </name><etal/></person-group><article-title>The cardiovascular health study: design and rationale</article-title><source>Ann Epidemiol</source><year>1991</year><month>02</month><volume>1</volume><issue>3</issue><fpage>263</fpage><lpage>276</lpage><pub-id pub-id-type="doi">10.1016/1047-2797(91)90005-w</pub-id><pub-id pub-id-type="medline">1669507</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>The Women&#x2019;s Health Initiative Study Group</collab></person-group><article-title>Design of the Women&#x2019;s Health Initiative Clinical Trial and Observational Study</article-title><source>Control Clin Trials</source><year>1998</year><month>02</month><volume>19</volume><issue>1</issue><fpage>61</fpage><lpage>109</lpage><pub-id pub-id-type="doi">10.1016/S0197-2456(97)00078-0</pub-id><pub-id pub-id-type="medline">9492970</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mailman</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Feolo</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>The NCBI dbGaP database of genotypes and phenotypes</article-title><source>Nat Genet</source><year>2007</year><month>10</month><volume>39</volume><issue>10</issue><fpage>1181</fpage><lpage>1186</lpage><pub-id pub-id-type="doi">10.1038/ng1007-1181</pub-id><pub-id 
pub-id-type="medline">17898773</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yuan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>F</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>S</given-names> </name></person-group><article-title>CODER: knowledge-infused cross-lingual medical term embedding for term normalization</article-title><source>J Biomed Inform</source><year>2022</year><month>02</month><volume>126</volume><fpage>103983</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2021.103983</pub-id><pub-id pub-id-type="medline">34990838</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>F</given-names> </name><name name-style="western"><surname>Shareghi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Meng</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Basaldella</surname><given-names>M</given-names> </name><name name-style="western"><surname>Collier</surname><given-names>N</given-names> </name></person-group><article-title>Self-alignment pretraining for biomedical entity representations</article-title><conf-name>Conference of the North American Chapter of the Association for Computational Linguistics</conf-name><conf-date>Jun 6-11, 2021</conf-date><fpage>4228</fpage><lpage>4238</lpage><pub-id pub-id-type="doi">10.18653/v1/2021.naacl-main.334</pub-id></nlm-citation></ref><ref 
id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Hovy</surname><given-names>E</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>Y</given-names> </name></person-group><article-title>Pre-trained language models and their applications</article-title><source>Engineering (Beijing)</source><year>2023</year><month>06</month><volume>25</volume><fpage>51</fpage><lpage>65</lpage><pub-id pub-id-type="doi">10.1016/j.eng.2022.04.024</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="web"><article-title>Unified medical language system (UMLS)</article-title><source>National Library of Medicine</source><access-date>2025-01-07</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.nlm.nih.gov/research/umls/index.html">https://www.nlm.nih.gov/research/umls/index.html</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Revised supplementary material.</p><media xlink:href="medinform_v13i1e54133_app1.docx" xlink:title="DOCX File, 3457 KB"/></supplementary-material></app-group></back></article>