<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn></journal-meta><article-meta><article-id pub-id-type="publisher-id">54590</article-id><article-id pub-id-type="doi">10.2196/54590</article-id><title-group><article-title>Data Lake, Data Warehouse, Datamart, and Feature Store: Their Contributions to the Complete Data Reuse Pipeline</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Lamer</surname><given-names>Antoine</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Saint-Dizier</surname><given-names>Chlo&#x00E9;</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Paris</surname><given-names>Nicolas</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chazard</surname><given-names>Emmanuel</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Univ. 
Lille, CHU Lille, ULR 2694-METRICS, Centre d'Etudes et de Recherche en Informatique M&#x00E9;dicale</institution>, <addr-line>Lille</addr-line>, <country>France</country></aff><aff id="aff2"><institution>F&#x00E9;d&#x00E9;ration r&#x00E9;gionale de recherche en psychiatrie et sant&#x00E9; mentale des Hauts-de-France</institution>, <addr-line>Saint-Andr&#x00E9;-lez-Lille</addr-line>, <country>France</country></aff><aff id="aff3"><institution>InterHop</institution>, <addr-line>Rennes</addr-line>, <country>France</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Lovis</surname><given-names>Christian</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Anand</surname><given-names>R</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Reinecke</surname><given-names>Ines</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Ahmadi</surname><given-names>Najia</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Steichen</surname><given-names>Olivier</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Antoine Lamer, PhD<email>antoine.lamer@univ-lille.fr</email></corresp></author-notes><pub-date pub-type="collection"><year>2024</year></pub-date><pub-date pub-type="epub"><day>17</day><month>7</month><year>2024</year></pub-date><volume>12</volume><elocation-id>e54590</elocation-id><history><date date-type="received"><day>27</day><month>11</month><year>2023</year></date><date date-type="rev-recd"><day>11</day><month>03</month><year>2024</year></date><date date-type="accepted"><day>05</day><month>04</month><year>2024</year></date></history><copyright-statement>&#x00A9; Antoine Lamer, Chlo&#x00E9; Saint-Dizier, Nicolas Paris, Emmanuel Chazard. 
Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 17.7.2024. </copyright-statement><copyright-year>2024</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2024/1/e54590"/><abstract><p>The growing adoption and use of health information technology has generated a wealth of clinical data in electronic format, offering opportunities for data reuse beyond direct patient care. However, as data are distributed across multiple software, it becomes challenging to cross-reference information between sources due to differences in formats, vocabularies, and technologies and the absence of common identifiers among software. To address these challenges, hospitals have adopted data warehouses to consolidate and standardize these data for research. Additionally, as a complement or alternative, data lakes store both source data and metadata in a detailed and unprocessed format, empowering exploration, manipulation, and adaptation of the data to meet specific analytical needs. 
Subsequently, datamarts are used to further refine data into usable information tailored to specific research questions. However, for efficient analysis, a feature store is essential to pivot and denormalize the data, simplifying queries. In conclusion, while data warehouses are crucial, data lakes, datamarts, and feature stores play essential and complementary roles in facilitating data reuse for research and analysis in health care.</p></abstract><kwd-group><kwd>data reuse</kwd><kwd>data lake</kwd><kwd>data warehouse</kwd><kwd>feature extraction</kwd><kwd>datamart</kwd><kwd>feature store</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Over the last few decades, the widespread adoption and use of health information systems (HISs) have transitioned a substantial amount of clinical data from manual to electronic format [<xref ref-type="bibr" rid="ref1">1</xref>]. HISs collect and deliver data for care, administrative, or billing purposes. In addition to these initial uses, HISs also offer opportunities for data reuse, defined as &#x201C;non-direct care use of personal health information&#x201D; [<xref ref-type="bibr" rid="ref2">2</xref>], such as research, quality of care, activity management, or public health [<xref ref-type="bibr" rid="ref3">3</xref>]. Hospitals have gradually adopted data warehouses to facilitate data reuse [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. Even if the data warehouse is a popular concept, data reuse is not limited to feeding and querying a data warehouse. In this viewpoint, our objective is to outline the different components of the data reuse pipeline and how they complement and interconnect with each other. This definition is derived from our personal experiences and insights gained through collaboration with colleagues at various institutions [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. 
Additionally, we draw on the collective experiences shared by professionals in the field, contributing to a comprehensive understanding of data reuse practices in diverse health care settings. The pipeline is illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref> and detailed below. <xref ref-type="table" rid="table1">Table 1</xref> compares characteristics of each component. Last, <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> provides examples of data, structures, and architectures for each component of the data reuse pipeline.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Components of the complete pipeline for data reuse. EHR: electronic health records; ETL: extract-transform-load.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v12i1e54590_fig01.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Characteristics of each component of the data reuse pipeline.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Characteristics</td><td align="left" valign="bottom">Software</td><td align="left" valign="bottom">Data lake</td><td align="left" valign="bottom">Data warehouse</td><td align="left" valign="bottom">Datamarts</td><td align="left" valign="bottom">Feature store</td></tr></thead><tbody><tr><td align="left" valign="top">Content</td><td align="left" valign="top">Data and metadata</td><td align="left" valign="top">Data and metadata</td><td align="left" valign="top">Data</td><td align="left" valign="top">Features</td><td align="left" valign="top">Features and metadata about features</td></tr><tr><td align="left" valign="top">Architecture</td><td align="left" valign="top">Distributed</td><td align="left" valign="top">Centralized</td><td align="left" valign="top">Centralized</td><td align="left" valign="top">Centralized</td><td align="left" 
valign="top">Centralized</td></tr><tr><td align="left" valign="top">Detail level</td><td align="left" valign="top">Fine-grained</td><td align="left" valign="top">Fine-grained</td><td align="left" valign="top">Fine-grained</td><td align="left" valign="top">Aggregated</td><td align="left" valign="top">Aggregated</td></tr><tr><td align="left" valign="top">Data</td><td align="left" valign="top">Raw</td><td align="left" valign="top">Raw</td><td align="left" valign="top">Cleaned</td><td align="left" valign="top">Cleaned</td><td align="left" valign="top">Cleaned</td></tr><tr><td align="left" valign="top">Nomenclature</td><td align="left" valign="top">Heterogeneous</td><td align="left" valign="top">Heterogeneous</td><td align="left" valign="top">Standardized</td><td align="left" valign="top">Standardized</td><td align="left" valign="top">Standardized</td></tr><tr><td align="left" valign="top">Data model</td><td align="left" valign="top">Normalized</td><td align="left" valign="top">Normalized</td><td align="left" valign="top">Normalized</td><td align="left" valign="top">Normalized</td><td align="left" valign="top">Denormalized</td></tr><tr><td align="left" valign="top">Data structure</td><td align="left" valign="top">Row-oriented</td><td align="left" valign="top">Row-oriented</td><td align="left" valign="top">Row-oriented</td><td align="left" valign="top">Row-oriented</td><td align="left" valign="top">Column-oriented</td></tr><tr><td align="left" valign="top">Purpose</td><td align="left" valign="top">Transactional software purpose</td><td align="left" valign="top">Ad hoc exploratory queries</td><td align="left" valign="top">All purposes</td><td align="left" valign="top">Prespecified purpose</td><td align="left" valign="top">Prespecified purpose</td></tr></tbody></table></table-wrap></sec><sec id="s2"><title>Ethical Considerations</title><p>This study does not include human participants research (no human participants experimentation or intervention was conducted) and so 
does not require institutional review board approval.</p></sec><sec id="s3"><title>Health Information System</title><p>The raw data stored in the HIS are distributed across multiple software, making it impossible to cross-reference information between sources due to variations in data formats, ranging from tabular to hierarchical structures and free text [<xref ref-type="bibr" rid="ref9">9</xref>]. Different technologies and distinct identification schemes for patients, admissions, or any other records compound the complexity. Additionally, direct write access to the software databases is typically unavailable, as software editors rarely grant such privileges to prevent any potential disruption to routine software operation. In transactional software databases, data consist of meticulously organized and highly accurate records presented in rows. These records are collected with great precision to fulfill the specific functions of the software. Alongside the data, a wealth of metadata is also present, including information regarding data collection (eg, information on the individuals inputting data, record timestamps, and biomedical equipment identifiers), as well as software configurations. Notably, a significant portion of these metadata may not be directly relevant to our research purposes, as they primarily support the routine functioning of the software.</p></sec><sec id="s4"><title>The Data Lake</title><p>An optional first component of a comprehensive data reuse pipeline is the data lake [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref14">14</xref>]. A data lake is a centralized, flexible, and scalable data storage system that ingests and stores raw data from multiple heterogeneous sources in its original format [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. Data are stored in a fine-grained, row-oriented, and raw format, in a secure and cost-effective environment. 
These raw data still encompass diverse formats, from structured data to unstructured text documents, images, songs, videos, and sensor data, ensuring that a wide spectrum of information is readily available for various data analytics endeavors [<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>The technologies implemented for the data lake can include the usual relational databases, such as PostgreSQL or Oracle, but also NoSQL databases and big data technologies, such as the Hadoop Distributed File System or Apache Hudi for the storage and Apache Spark, Hadoop MapReduce, or Apache Kudu for the data processing.</p><p>Unlike structured data typically integrated into data warehouses, the data lake refrains from immediate structuring or transformation, allowing for a more agile and adaptable approach. This flexibility enables exploration, manipulation, and, if necessary, transformation of the data to fulfill specific research or analytical requirements. By delaying the application of predefined data models, the data lake cultivates an environment where information can be uncovered without predetermined hypotheses. This includes insights that may not have been evident during the initial phases of data collection and storage. The system further facilitates on-the-fly query processing and data analysis [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>In a data pipeline without a data lake, it is essential to finalize the extract-transform-load (ETL) process before leveraging the data. This introduces a time delay, as it necessitates identifying relevant data in the HIS, updating the data warehouse data model for their accommodation, and subsequently designing and implementing the ETL.</p><p>In addition, when interpreting the results, if it becomes apparent that relevant data are missing for the analysis, it requires updating both the ETL process and the data model to incorporate the missing data. 
This iterative cycle of identifying, modifying, and reimplementing can lead to prolonged timelines and may hinder the agility of the data analysis process. Therefore, a data lake approach proves advantageous in providing a more flexible and dynamic environment for data exploration and analysis, potentially avoiding some of these challenges encountered in a traditional pipeline.</p></sec><sec id="s5"><title>The Data Warehouse</title><p>The data warehouse stands as the most prevalent component of the pipeline and acts as a centralized repository of integrated data from 1 or more disparate sources [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref19">19</xref>]. It stores historical and current fine-grained data in a format optimized for further use. This involves a single storage technology, a consistent naming convention for tables and fields, and coherent identifiers across data sources. This is a departure from the data lake where all these elements vary between sources.</p><p>The data warehouse is supplied through an ETL process [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. The primary objective of this process is to select and extract relevant data from the HIS or other external resources [<xref ref-type="bibr" rid="ref19">19</xref>]. During this initial phase, the majority of metadata linked to software operations (such as usage logs or interface settings), monitors, and individuals inputting data are usually excluded. Indeed, these types of metadata do not relate to patient care information and would introduce an unnecessary volume of data. Subsequently, the ETL process enhances the raw data by identifying and correcting any abnormal or erroneous information. 
Following this refinement, data are integrated into a unified data model independent of the source software [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. Notably, there is a strong focus on harmonizing identifiers from diverse data sources to ensure data integrity and streamline queries involving information from multiple origins. The ETL process is also responsible for regularly updating the data warehouse with new information recorded in the original data sources.</p><p>The data warehouse, as a relational database, is typically implemented using systems like PostgreSQL, Oracle, SQL Server, Apache Impala, or Netezza. However, for a data warehouse, exploring NoSQL technologies such as MongoDB, Cassandra, or Couchbase can also be interesting, offering advantages in handling unstructured or semistructured data, and providing scalability for large-scale data storage and retrieval [<xref ref-type="bibr" rid="ref20">20</xref>]. The ETL process can be developed using 2 types of technologies. The first relies on programming languages such as R (R Core Team), Python (Python Software Foundation), or Java (Oracle Corporation), coupled with a scheduler like Apache Airflow (Apache Software Foundation) to organize the execution of scripts and the retrieval of logs and error messages. The second type is graphical user interface software, such as Talend (Talend) or Pentaho (Hitachi Vantara). These tools do not require programming skills because graphical components, corresponding to data management operations, are organized through a drag-and-drop interface.</p><p>To foster collaboration among institutions and facilitate the sharing of tools, methods, and results, several initiatives have emerged to offer common data models (CDMs). As a result, table and field names are standardized following a common nomenclature, and local vocabularies and terminologies are mapped to a shared vocabulary. 
Among these CDMs, the Observational Medical Outcomes Partnership CDM was developed by the Observational Health Data Sciences and Informatics community, which brings together multiple countries and thousands of users [<xref ref-type="bibr" rid="ref21">21</xref>] and led to methodological and practical advancements [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>].</p><p>As a result, the data warehouse functions as a unified, centralized, and normalized repository, for both fine-grained historical data and metadata, and continues to present information in a row-oriented format. The modeling approach presented by Inmon [<xref ref-type="bibr" rid="ref24">24</xref>] and described as a &#x201C;subject-oriented, nonvolatile, integrated, time-variant collection of data&#x201D; implies that data are stored persistently without any assumptions as to their future use, thus remaining open-ended in their usage.</p></sec><sec id="s6"><title>The Datamarts</title><p>While the data warehouse serves as a unique standardized repository, primarily dedicated to data storage, querying these data can be time-consuming due to the volume and distribution of data in the relational model. Furthermore, raw data integrated into the data warehouse may not be readily aligned with specific research or analytical questions, as these data lack the necessary aggregated features. For instance, the data warehouse retains all biological measurements (eg, potassium and sodium), while what will be stored in the datamart are the features related to the biology values, such as the occurrence of hypokalemia, hyperkalemia, hyponatremia, or hypernatremia. Thus, the datamart acts as a dedicated resource for transforming the data into usable and meaningful information [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. 
This transformation process involves feature extraction, achieved through the application of algorithms and domain-specific rules [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. The outcome is data that are tailored to address specific research questions or analytical needs. For instance, within a clinical setting, the datamart can convert raw mean arterial pressure values into a format suitable for detecting perioperative hypotension [<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>Moreover, datamarts can be organized in the form of online analytical processing (OLAP) cubes, offering a multidimensional view of the data [<xref ref-type="bibr" rid="ref27">27</xref>]. This cubical structure allows for in-depth analysis, enabling users to efficiently explore and navigate across various dimensions such as time, geography, or specific categories, gaining profound and contextualized insights. These datamarts are often modeled in either a snowflake or star schema, optimizing their structure for the creation of OLAP cubes. The star schema, with its central fact table surrounded by dimension tables, or the snowflake schema, which further normalizes dimension tables to minimize data redundancy, both serve to facilitate the creation of these OLAP cubes. Such schemas play a pivotal role in enhancing the efficiency of multidimensional data analysis within the OLAP environment, providing a structured framework for faster and more comprehensive insights.</p><p>In the context of health care, an example of an OLAP cube could encompass dimensions such as patient (eg, age and gender); time (eg, admission and discharge dates); medical conditions (eg, primary and secondary diagnoses and medical procedures); hospital unit (eg, information on services, departments, and bed types); health care provider (eg, physicians); and outcome (eg, length of stay, treatment outcomes, and medical costs). 
The cube would include various facts, such as the number of patients, average length of stay, and average treatment costs. This multidimensional structure allows health care professionals to conduct in-depth analyses, explore trends over time, compare costs across different hospital units, and assess the impact of medical interventions on patient outcomes [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref28">28</xref>].</p><p>Datamarts, owing to the structured nature of their data, are typically stored on relational databases (eg, PostgreSQL, Oracle, and SQL Server) [<xref ref-type="bibr" rid="ref25">25</xref>]. In the case of OLAP cubes, this may include Apache Kylin or other proprietary OLAP tools built on relational databases [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>].</p><p>In contrast to Inmon&#x2019;s [<xref ref-type="bibr" rid="ref24">24</xref>] approach, the Kimball [<xref ref-type="bibr" rid="ref9">9</xref>] bottom-up approach places datamarts at the core, with their design driven primarily by business requirements. However, by directly developing datamarts, the Kimball approach may overlook some crucial data that were not initially identified as relevant during the business requirements phase.</p><p>As a result, the datamart stands as a centralized component for cleaned and aggregated features for dedicated purposes, still stored in a row-oriented structure.</p></sec><sec id="s7"><title>The Feature Store</title><p>The feature store addresses the limitations of the traditional row-oriented, relational database structure typically used in datamarts. This architecture, which relies on multiple tables, may not fully meet various analytical requirements. For instance, effective statistical analysis often necessitates a single, flat file with column-oriented variables, mandating the transformation of data from a row-based to a column-based format within the feature store. 
This process streamlines data access, simplifying complex queries into straightforward selections from a single table. Consequently, the feature store emerges as a centralized repository housing well-documented, curated, and access-controlled features. In addition to features extracted from datamarts, which are often calculated by algorithms derived from business rules, the feature store can also receive features generated by machine learning algorithms [<xref ref-type="bibr" rid="ref30">30</xref>].</p><p>The design of the feature store aims to provide data scientists with direct access to these features, eliminating the need for additional data cleaning, aggregation, or pivoting [<xref ref-type="bibr" rid="ref31">31</xref>]. This specialized role enhances efficiency and promotes the use of high-quality, analysis-ready data, significantly contributing to the effectiveness of data-driven research in the health care organization. Notably, the feature store not only stores the features themselves but also their associated metadata, documenting how they were calculated and used [<xref ref-type="bibr" rid="ref31">31</xref>]. It ensures the preservation of all feature versions, guaranteeing the reproducibility of analyses.</p><p>When derived from business rules, features are stored in relational databases (eg, PostgreSQL, Oracle, and SQL Server) or in a NoSQL data store such as MongoDB to also store metadata. When features originate from machine learning models, they are stored and shared from big data platforms such as Databricks or Hopsworks [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref32">32</xref>].</p><p>As the final component of the data reuse pipeline, the feature store plays a pivotal role in various analytical applications within the health care organization. It significantly contributes to the creation of insightful dashboards and automated reports, delivering real-time and historical information. 
In research, its most crucial contribution lies in generating denormalized flat tables, similar to questionnaire data tailored for statistical analyses.</p></sec><sec id="s8" sec-type="conclusions"><title>Conclusions</title><p>In this opinion paper, we propose standardized nomenclature and definitions for the components of a data reuse pipeline. <xref ref-type="table" rid="table2">Table 2</xref> summarizes the advantages and limitations of each component in this pipeline.</p><p>While the data warehouse serves as a necessary initial stage, the integration of datamarts and a feature store enhances its effectiveness. Datamarts compute pertinent information from raw data, while the feature store organizes it into columns, streamlining data set construction. Additionally, the data lake emerges as a valuable resource for storing raw data in a single location, allowing for exploitation without having to wait for the entire pipeline to be developed.</p><p>Notably, in a data pipeline without a data lake, the requirement to complete the ETL process before analysis introduces delays. This involves identifying relevant data in the HIS, adapting the data warehouse data model, and implementing the ETL. Additionally, discovering missing data during result interpretation prompts iterative updates to both the ETL process and the data model, potentially prolonging timelines and hindering data analysis agility.</p><p>It is important to emphasize that the specific components and their characteristics described here are not rigidly fixed and can vary based on the unique organizational needs and configurations. 
For instance, the inclusion of a data lake and feature store is often discretionary, influenced by factors such as the scale and intricacy of source data, the quantity of features, the scope of research projects, the team&#x2019;s size, and the imperative for study reproducibility over time.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Advantages and disadvantages of the components of the data reuse pipeline.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Component</td><td align="left" valign="top">Advantages</td><td align="left" valign="top">Disadvantages</td></tr></thead><tbody><tr><td align="left" valign="top">Data lake</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>All data sources on the same server</p></list-item><list-item><p>Independence from source software</p></list-item><list-item><p>On-the-fly query processing and data analysis without the need for the complete development of an extract-transform-load (ETL) process</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Inconsistencies in data formats and structures</p></list-item><list-item><p>Lack of standard schema can make querying complex</p></list-item><list-item><p>Analyses reproducibility</p></list-item></list></td></tr><tr><td align="left" valign="top">Data warehouse</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Querying data from both administrative and biology systems is facilitated by the unified data model (ie, data from both systems are linked, and the model conventions are consistent)</p></list-item><list-item><p>Relevant data are retained at the finest level of detail (eg, dates, diagnoses, and all biology values), enabling the answering of numerous questions without necessarily identifying them beforehand</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>ETL process 
must be implemented to standardize the data</p></list-item><list-item><p>Multidimensional data model with several statistical units</p></list-item><list-item><p>Fine-grained data are not directly usable or adapted for statistical analysis or decision-making</p></list-item></list></td></tr><tr><td align="left" valign="top">Datamarts</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Features are ready to be used directly</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Features are still organized in a row format (ie, 1 feature per row) in several datamarts</p></list-item></list></td></tr><tr><td align="left" valign="top">Feature store</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Using features directly, without the need for data management tasks such as joining datamarts or pivoting to reorganize features into columns</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Requires the entire pipeline to be developed beforehand</p></list-item></list></td></tr></tbody></table></table-wrap></sec></body><back><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">CDM</term><def><p>common data model</p></def></def-item><def-item><term id="abb2">ETL</term><def><p>extract-transform-load</p></def></def-item><def-item><term id="abb3">HIS</term><def><p>health information system</p></def></def-item><def-item><term id="abb4">OLAP</term><def><p>online analytical processing</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adler-Milstein</surname><given-names>J</given-names> </name><name name-style="western"><surname>DesRoches</surname><given-names>CM</given-names> </name><name 
name-style="western"><surname>Kralovec</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Electronic health record adoption in US hospitals: progress continues, but challenges persist</article-title><source>Health Aff</source><year>2015</year><month>12</month><volume>34</volume><issue>12</issue><fpage>2174</fpage><lpage>2180</lpage><pub-id pub-id-type="doi">10.1377/hlthaff.2015.0992</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Safran</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bloomrosen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hammond</surname><given-names>WE</given-names> </name><etal/></person-group><article-title>Toward a national framework for the secondary use of health data: an American Medical Informatics Association white paper</article-title><source>J Am Med Inform Assoc</source><year>2007</year><month>01</month><volume>14</volume><issue>1</issue><fpage>1</fpage><lpage>9</lpage><pub-id pub-id-type="doi">10.1197/jamia.M2273</pub-id><pub-id pub-id-type="medline">17077452</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Safran</surname><given-names>C</given-names> </name></person-group><article-title>Reuse of clinical data</article-title><source>Yearb Med Inform</source><year>2014</year><month>08</month><day>15</day><volume>9</volume><issue>1</issue><fpage>52</fpage><lpage>54</lpage><pub-id pub-id-type="doi">10.15265/IY-2014-0013</pub-id><pub-id pub-id-type="medline">25123722</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wisniewski</surname><given-names>MF</given-names> 
</name><name name-style="western"><surname>Kieszkowski</surname><given-names>P</given-names> </name><name name-style="western"><surname>Zagorski</surname><given-names>BM</given-names> </name><name name-style="western"><surname>Trick</surname><given-names>WE</given-names> </name><name name-style="western"><surname>Sommers</surname><given-names>M</given-names> </name><name name-style="western"><surname>Weinstein</surname><given-names>RA</given-names> </name></person-group><article-title>Development of a clinical data warehouse for hospital infection control</article-title><source>J Am Med Inform Assoc</source><year>2003</year><month>09</month><volume>10</volume><issue>5</issue><fpage>454</fpage><lpage>462</lpage><pub-id pub-id-type="doi">10.1197/jamia.M1299</pub-id><pub-id pub-id-type="medline">12807807</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lamer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Moussa</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Marcilly</surname><given-names>R</given-names> </name><name name-style="western"><surname>Logier</surname><given-names>R</given-names> </name><name name-style="western"><surname>Vallet</surname><given-names>B</given-names> </name><name name-style="western"><surname>Tavernier</surname><given-names>B</given-names> </name></person-group><article-title>Development and usage of an anesthesia data warehouse: lessons learnt from a 10-year project</article-title><source>J Clin Monit Comput</source><year>2023</year><month>04</month><volume>37</volume><issue>2</issue><fpage>461</fpage><lpage>472</lpage><pub-id pub-id-type="doi">10.1007/s10877-022-00898-y</pub-id><pub-id pub-id-type="medline">35933465</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Chazard</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ficheur</surname><given-names>G</given-names> </name><name name-style="western"><surname>Caron</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Secondary use of healthcare structured data: the challenge of domain-knowledge based extraction of features</article-title><source>Stud Health Technol Inform</source><year>2018</year><volume>255</volume><fpage>15</fpage><lpage>19</lpage><pub-id pub-id-type="medline">30306898</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lamer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Fruchart</surname><given-names>M</given-names> </name><name name-style="western"><surname>Paris</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Standardized description of the feature extraction process to transform raw data into meaningful information for enhancing data reuse: consensus study</article-title><source>JMIR Med Inform</source><year>2022</year><month>10</month><day>17</day><volume>10</volume><issue>10</issue><fpage>e38936</fpage><pub-id pub-id-type="doi">10.2196/38936</pub-id><pub-id pub-id-type="medline">36251369</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Doutreligne</surname><given-names>M</given-names> </name><name name-style="western"><surname>Degremont</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jachiet</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Lamer</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Tannier</surname><given-names>X</given-names> </name></person-group><article-title>Good practices for clinical data warehouse implementation: a case study in France</article-title><source>PLOS Digit Health</source><year>2023</year><month>07</month><volume>2</volume><issue>7</issue><fpage>e0000298</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000298</pub-id><pub-id pub-id-type="medline">37410797</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Kimball</surname><given-names>R</given-names> </name></person-group><source>The Data Warehouse Lifecycle Toolkit: Expert Methods for Designing, Developing, and Deploying Data Warehouses</source><year>1998</year><publisher-name>John Wiley &#x0026; Sons</publisher-name><pub-id pub-id-type="other">978-0-471-25547-5</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wieder</surname><given-names>P</given-names> </name><name name-style="western"><surname>Nolte</surname><given-names>H</given-names> </name></person-group><article-title>Toward data lakes as central building blocks for data management and analysis</article-title><source>Front Big Data</source><year>2022</year><month>08</month><volume>5</volume><fpage>945720</fpage><pub-id pub-id-type="doi">10.3389/fdata.2022.945720</pub-id><pub-id pub-id-type="medline">36072823</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Madera</surname><given-names>C</given-names> </name><name name-style="western"><surname>Laurent</surname><given-names>A</given-names> </name></person-group><article-title>The next information architecture evolution: the data lake 
wave</article-title><conf-name>MEDES&#x2019;16: The 8th International Conference on ManagEment of Digital EcoSystems</conf-name><conf-date>Nov 1 to 4, 2016</conf-date><conf-loc>Biarritz, France</conf-loc><fpage>174</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1145/3012071.3012077</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sarramia</surname><given-names>D</given-names> </name><name name-style="western"><surname>Claude</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ogereau</surname><given-names>F</given-names> </name><name name-style="western"><surname>Mezhoud</surname><given-names>J</given-names> </name><name name-style="western"><surname>Mailhot</surname><given-names>G</given-names> </name></person-group><article-title>CEBA: a data lake for data sharing and environmental monitoring</article-title><source>Sensors (Basel)</source><year>2022</year><month>04</month><day>2</day><volume>22</volume><issue>7</issue><fpage>2733</fpage><pub-id pub-id-type="doi">10.3390/s22072733</pub-id><pub-id pub-id-type="medline">35408347</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Che</surname><given-names>H</given-names> </name><name name-style="western"><surname>Duan</surname><given-names>Y</given-names> </name></person-group><article-title>On the logical design of a prototypical data lake system for biological resources</article-title><source>Front Bioeng Biotechnol</source><year>2020</year><month>09</month><volume>8</volume><fpage>553904</fpage><pub-id pub-id-type="doi">10.3389/fbioe.2020.553904</pub-id><pub-id pub-id-type="medline">33117777</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>HV</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rao</surname><given-names>BD</given-names> </name><name name-style="western"><surname>J</surname><given-names>MK</given-names> </name><name name-style="western"><surname>Rao</surname><given-names>BD</given-names> </name></person-group><article-title>Design an efficient data driven decision support system to predict flooding by analysing heterogeneous and multiple data sources using data lake</article-title><source>MethodsX</source><year>2023</year><month>12</month><volume>11</volume><fpage>102262</fpage><pub-id pub-id-type="doi">10.1016/j.mex.2023.102262</pub-id><pub-id pub-id-type="medline">37448950</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hai</surname><given-names>R</given-names> </name><name name-style="western"><surname>Koutras</surname><given-names>C</given-names> </name><name name-style="western"><surname>Quix</surname><given-names>C</given-names> </name><name name-style="western"><surname>Jarke</surname><given-names>M</given-names> </name></person-group><article-title>Data lakes: a survey of functions and systems</article-title><source>IEEE Trans Knowl Data Eng</source><year>2023</year><month>12</month><day>1</day><volume>35</volume><issue>12</issue><fpage>12571</fpage><lpage>12590</lpage><pub-id pub-id-type="doi">10.1109/TKDE.2023.3270101</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jannot</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Zapletal</surname><given-names>E</given-names> </name><name name-style="western"><surname>Avillach</surname><given-names>P</given-names> </name><name 
name-style="western"><surname>Mamzer</surname><given-names>MF</given-names> </name><name name-style="western"><surname>Burgun</surname><given-names>A</given-names> </name><name name-style="western"><surname>Degoulet</surname><given-names>P</given-names> </name></person-group><article-title>The Georges Pompidou University hospital clinical data warehouse: a 8-years follow-up experience</article-title><source>Int J Med Inform</source><year>2017</year><month>06</month><volume>102</volume><fpage>21</fpage><lpage>28</lpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2017.02.006</pub-id><pub-id pub-id-type="medline">28495345</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>W</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>F</given-names> </name><name name-style="western"><surname>Mccarthy</surname><given-names>DP</given-names> </name><etal/></person-group><article-title>Research data warehouse: using electronic health records to conduct population-based observational studies</article-title><source>JAMIA Open</source><year>2023</year><month>07</month><volume>6</volume><issue>2</issue><fpage>ooad039</fpage><pub-id pub-id-type="doi">10.1093/jamiaopen/ooad039</pub-id><pub-id pub-id-type="medline">37359950</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fleuren</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Dam</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Tonutti</surname><given-names>M</given-names> </name><etal/></person-group><article-title>The Dutch Data Warehouse, a multicenter and full-admission electronic health records database for critically ill COVID-19 
patients</article-title><source>Crit Care</source><year>2021</year><month>08</month><day>23</day><volume>25</volume><issue>1</issue><fpage>304</fpage><pub-id pub-id-type="doi">10.1186/s13054-021-03733-z</pub-id><pub-id pub-id-type="medline">34425864</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Agapito</surname><given-names>G</given-names> </name><name name-style="western"><surname>Zucco</surname><given-names>C</given-names> </name><name name-style="western"><surname>Cannataro</surname><given-names>M</given-names> </name></person-group><article-title>COVID-WAREHOUSE: a data warehouse of Italian COVID-19, pollution, and climate data</article-title><source>Int J Environ Res Public Health</source><year>2020</year><month>08</month><day>3</day><volume>17</volume><issue>15</issue><fpage>5596</fpage><pub-id pub-id-type="doi">10.3390/ijerph17155596</pub-id><pub-id pub-id-type="medline">32756428</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McClay</surname><given-names>W</given-names> </name></person-group><article-title>A Magnetoencephalographic/encephalographic (MEG/EEG) brain-computer interface driver for interactive iOS mobile videogame applications utilizing the Hadoop Ecosystem, MongoDB, and Cassandra NoSQL databases</article-title><source>Diseases</source><year>2018</year><month>09</month><day>28</day><volume>6</volume><issue>4</issue><fpage>89</fpage><pub-id pub-id-type="doi">10.3390/diseases6040089</pub-id><pub-id pub-id-type="medline">30274210</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Blacketer</surname><given-names>C</given-names> </name></person-group><source>The Book of 
OHDSI</source><year>2021</year><access-date>2024-11-09</access-date><publisher-name>Observational Health Data Sciences and Informatics</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://ohdsi.github.io/TheBookOfOhdsi/">https://ohdsi.github.io/TheBookOfOhdsi/</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schuemie</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Gini</surname><given-names>R</given-names> </name><name name-style="western"><surname>Coloma</surname><given-names>PM</given-names> </name><etal/></person-group><article-title>Replication of the OMOP experiment in Europe: evaluating methods for risk identification in electronic health record databases</article-title><source>Drug Saf</source><year>2013</year><month>10</month><volume>36</volume><issue>S1</issue><fpage>S159</fpage><lpage>S169</lpage><pub-id pub-id-type="doi">10.1007/s40264-013-0109-8</pub-id><pub-id pub-id-type="medline">24166232</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lane</surname><given-names>JCE</given-names> </name><name name-style="western"><surname>Weaver</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kostka</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Risk of hydroxychloroquine alone and in combination with azithromycin in the treatment of rheumatoid arthritis: a multinational, retrospective study</article-title><source>Lancet Rheumatol</source><year>2020</year><month>11</month><volume>2</volume><issue>11</issue><fpage>e698</fpage><lpage>e711</lpage><pub-id pub-id-type="doi">10.1016/S2665-9913(20)30276-9</pub-id><pub-id pub-id-type="medline">32864627</pub-id></nlm-citation></ref><ref 
id="ref24"><label>24</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Inmon</surname><given-names>WH</given-names> </name></person-group><source>Building the Data Warehouse</source><year>1992</year><publisher-name>Wiley</publisher-name><pub-id pub-id-type="other">978-0-471-56960-2</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hinchcliff</surname><given-names>M</given-names> </name><name name-style="western"><surname>Just</surname><given-names>E</given-names> </name><name name-style="western"><surname>Podlusky</surname><given-names>S</given-names> </name><name name-style="western"><surname>Varga</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>RW</given-names> </name><name name-style="western"><surname>Kibbe</surname><given-names>WA</given-names> </name></person-group><article-title>Text data extraction for a prospective, research-focused data mart: implementation and validation</article-title><source>BMC Med Inform Decis Mak</source><year>2012</year><month>09</month><day>13</day><volume>12</volume><fpage>106</fpage><pub-id pub-id-type="doi">10.1186/1472-6947-12-106</pub-id><pub-id pub-id-type="medline">22970696</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>HS</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>H</given-names> </name><name name-style="western"><surname>Jeong</surname><given-names>YJ</given-names> </name><etal/></person-group><article-title>Development of clinical data mart of HMG-CoA reductase inhibitor for varied clinical research</article-title><source>Endocrinol Metab 
(Seoul)</source><year>2017</year><month>03</month><volume>32</volume><issue>1</issue><fpage>90</fpage><lpage>98</lpage><pub-id pub-id-type="doi">10.3803/EnM.2017.32.1.90</pub-id><pub-id pub-id-type="medline">28256114</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hristovski</surname><given-names>D</given-names> </name><name name-style="western"><surname>Rogac</surname><given-names>M</given-names> </name><name name-style="western"><surname>Markota</surname><given-names>M</given-names> </name></person-group><article-title>Using data warehousing and OLAP in public health care</article-title><source>Proc AMIA Symp</source><year>2000</year><fpage>369</fpage><lpage>373</lpage><pub-id pub-id-type="medline">11079907</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vik</surname><given-names>S</given-names> </name><name name-style="western"><surname>Seidel</surname><given-names>J</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>C</given-names> </name><name name-style="western"><surname>Marshall</surname><given-names>DA</given-names> </name></person-group><article-title>Breaking the 80:20 rule in health research using large administrative data sets</article-title><source>Health Informatics J</source><year>2023</year><volume>29</volume><issue>2</issue><fpage>14604582231180581</fpage><pub-id pub-id-type="doi">10.1177/14604582231180581</pub-id><pub-id pub-id-type="medline">37269132</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ranawade</surname><given-names>SV</given-names> </name><name name-style="western"><surname>Navale</surname><given-names>S</given-names> 
</name><name name-style="western"><surname>Dhamal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Deshpande</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ghuge</surname><given-names>C</given-names> </name></person-group><article-title>Online analytical processing on Hadoop using Apache Kylin</article-title><source>Int J Appl Inf Syst</source><year>2017</year><month>05</month><day>5</day><volume>12</volume><issue>2</issue><fpage>1</fpage><lpage>5</lpage><pub-id pub-id-type="doi">10.5120/ijais2017451682</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Armgarth</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pantzare</surname><given-names>S</given-names> </name><name name-style="western"><surname>Arven</surname><given-names>P</given-names> </name><etal/></person-group><article-title>A digital nervous system aiming toward personalized IoT healthcare</article-title><source>Sci Rep</source><year>2021</year><month>04</month><day>8</day><volume>11</volume><issue>1</issue><fpage>7757</fpage><pub-id pub-id-type="doi">10.1038/s41598-021-87177-z</pub-id><pub-id pub-id-type="medline">33833303</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Woodhouse</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Portwood</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Andorf</surname><given-names>CM</given-names> </name></person-group><article-title>Maize Feature Store: a centralized resource to manage and analyze curated maize multi-omics features for machine learning 
applications</article-title><source>Database (Oxford)</source><year>2023</year><month>11</month><day>6</day><volume>2023</volume><fpage>baad078</fpage><pub-id pub-id-type="doi">10.1093/database/baad078</pub-id><pub-id pub-id-type="medline">37935586</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rajendran</surname><given-names>S</given-names> </name><name name-style="western"><surname>Obeid</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Binol</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Cloud-based federated learning implementation across medical centers</article-title><source>JCO Clin Cancer Inform</source><year>2021</year><month>01</month><volume>5</volume><fpage>1</fpage><lpage>11</lpage><pub-id pub-id-type="doi">10.1200/CCI.20.00060</pub-id><pub-id pub-id-type="medline">33411624</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Comparison of data, structures, and architectures of components of the data reuse pipeline.</p><media xlink:href="medinform_v12i1e54590_app1.docx" xlink:title="DOCX File, 68 KB"/></supplementary-material></app-group></back></article>