@Article{info:doi/10.2196/65566, author="Bak, Marieke and Hartman, Laura and Graafland, Charlotte and Korfage, J. Ida and Buyx, Alena and Schermer, Maartje", title="Ethical Design of Data-Driven Decision Support Tools for Improving Cancer Care: Embedded Ethics Review of the 4D PICTURE Project", journal="JMIR Cancer", year="2025", month="Apr", day="10", volume="11", pages="e65566", keywords="shared decision-making", keywords="oncology", keywords="IT", keywords="ethics", keywords="decision support tools", keywords="big data", keywords="medical decision-making", keywords="artificial intelligence", doi="10.2196/65566", url="https://cancer.jmir.org/2025/1/e65566" } @Article{info:doi/10.2196/70983, author="Schmit, D. Cason and O'Connell, Curry Meghan and Shewbrooks, Sarah and Abourezk, Charles and Cochlin, J. Fallon and Doerr, Megan and Kum, Hye-Chung", title="Dying in Darkness: Deviations From Data Sharing Ethics in the US Public Health System and the Data Genocide of American Indian and Alaska Native Communities", journal="J Med Internet Res", year="2025", month="Mar", day="26", volume="27", pages="e70983", keywords="ethics", keywords="information dissemination", keywords="indigenous peoples", keywords="public health surveillance", keywords="privacy", keywords="data sharing", keywords="deidentification", keywords="data anonymization", keywords="public health ethics", keywords="data governance", doi="10.2196/70983", url="https://www.jmir.org/2025/1/e70983" } @Article{info:doi/10.2196/59094, author="Dawadi, Research and Inoue, Mai and Tay, Ting Jie and Martin-Morales, Agustin and Vu, Thien and Araki, Michihiro", title="Disease Prediction Using Machine Learning on Smartphone-Based Eye, Skin, and Voice Data: Scoping Review", journal="JMIR AI", year="2025", month="Mar", day="25", volume="4", pages="e59094", keywords="literature review", keywords="machine learning", keywords="smartphone", keywords="health diagnosis", abstract="Background: The application of machine learning methods to data generated by ubiquitous devices like smartphones presents an opportunity to enhance the quality of health care and diagnostics. Smartphones are ideal for gathering data easily, providing quick feedback on diagnoses, and proposing interventions for health improvement. Objective: We reviewed the existing literature to gather studies that have used machine learning models with smartphone-derived data for the prediction and diagnosis of health anomalies. We divided the studies into those that used machine learning models by conducting experiments to retrieve data and predict diseases, and those that used machine learning models on publicly available databases. The details of databases, experiments, and machine learning models are intended to help researchers working in the fields of machine learning and artificial intelligence in the health care domain. Researchers can use the information to design their experiments or determine the databases they could analyze. Methods: A comprehensive search of the PubMed and IEEE Xplore databases was conducted, and an in-house keyword screening method was used to filter the articles based on the content of their titles and abstracts. Subsequently, studies related to the 3 areas of voice, skin, and eye were selected and analyzed based on how data for machine learning models were extracted (ie, the use of publicly available databases or through experiments). The machine learning methods used in each study were also noted. 
Results: A total of 49 studies were identified as being relevant to the topic of interest, and among these studies, there were 31 different databases and 24 different machine learning methods. Conclusions: The results provide a better understanding of how smartphone data are collected for predicting different diseases and what kinds of machine learning methods are used on these data. Similarly, publicly available databases having smartphone-based data that can be used for the diagnosis of various diseases have been presented. Our screening method could be used or improved in future studies, and our findings could be used as a reference to conduct similar studies, experiments, or statistical analyses. ", doi="10.2196/59094", url="https://ai.jmir.org/2025/1/e59094" } @Article{info:doi/10.2196/67033, author="Hao, Jie and Chen, Zhenli and Peng, Qinglong and Zhao, Liang and Zhao, Wanqing and Cong, Shan and Li, Junlian and Li, Jiao and Qian, Qing and Sun, Haixia", title="Prompt Framework for Extracting Scale-Related Knowledge Entities from Chinese Medical Literature: Development and Evaluation Study", journal="J Med Internet Res", year="2025", month="Mar", day="18", volume="27", pages="e67033", keywords="prompt engineering", keywords="named entity recognition", keywords="in-context learning", keywords="large language model", keywords="Chinese medical literature", keywords="measurement-based care", keywords="framework", keywords="prompt", keywords="prompt framework", keywords="scale", keywords="China", keywords="medical literature", keywords="MBC", keywords="LLM", keywords="MedScaleNER", keywords="retrieval", keywords="information retrieval", keywords="dataset", keywords="artificial intelligence", keywords="AI", abstract="Background: Measurement-based care improves patient outcomes by using standardized scales, but its widespread adoption is hindered by the lack of accessible and structured knowledge, particularly in unstructured Chinese medical literature. Extracting scale-related knowledge entities from these texts is challenging due to limited annotated data. While large language models (LLMs) show promise in named entity recognition (NER), specialized prompting strategies are needed to accurately recognize medical scale-related entities, especially in low-resource settings. Objective: This study aims to develop and evaluate MedScaleNER, a task-oriented prompt framework designed to optimize LLM performance in recognizing medical scale-related entities from Chinese medical literature. Methods: MedScaleNER incorporates demonstration retrieval within in-context learning, chain-of-thought prompting, and self-verification strategies to improve performance. The framework dynamically retrieves optimal examples using a k-nearest neighbors approach and decomposes the NER task into two subtasks: entity type identification and entity labeling. Self-verification ensures the reliability of the final output. A dataset of manually annotated Chinese medical journal papers was constructed, focusing on three key entity types: scale names, measurement concepts, and measurement items. Experiments were conducted by varying the number of examples and the proportion of training data to evaluate performance in low-resource settings. Additionally, MedScaleNER's performance was compared with locally fine-tuned models. Results: The CMedS-NER (Chinese Medical Scale Corpus for Named Entity Recognition) dataset, containing 720 papers with 27,499 manually annotated scale-related knowledge entities, was used for evaluation. 
Initial experiments identified GLM-4-0520 as the best-performing LLM among six tested models. When applied with GLM-4-0520, MedScaleNER significantly improved NER performance for scale-related entities, achieving a macro F1-score of 59.64\% in an exact string match with the full training dataset. The highest performance was achieved with 20-shot demonstrations. Under low-resource scenarios (eg, 1\% of the training data), MedScaleNER outperformed all tested locally fine-tuned models. Ablation studies highlighted the importance of demonstration retrieval and self-verification in improving model reliability. Error analysis revealed four main types of mistakes: identification errors, type errors, boundary errors, and missing entities, indicating areas for further improvement. Conclusions: MedScaleNER advances the application of LLMs and prompt engineering for specialized NER tasks in Chinese medical literature. By addressing the challenges of unstructured texts and limited annotated data, MedScaleNER's adaptability to various biomedical contexts supports more efficient and reliable knowledge extraction, contributing to broader measurement-based care implementation and improved clinical and research outcomes. ", doi="10.2196/67033", url="https://www.jmir.org/2025/1/e67033" } @Article{info:doi/10.2196/60548, author="Donkin, Liesje and Henry, Nathan and Kercher, Amy and Pedersen, Mangor and Wilson, Holly and Chan, Yan Amy Hai", title="Effective Recruitment or Bot Attack? The Challenge of Internet-Based Research Surveys and Recommendations to Reduce Risk and Improve Robustness", journal="Interact J Med Res", year="2025", month="Mar", day="14", volume="14", pages="e60548", keywords="internet-based research", keywords="research methodology", keywords="surveys", keywords="data integrity", keywords="bot attacks", keywords="technology", keywords="data manipulation", keywords="spam", keywords="false", keywords="falsification", keywords="fraudulent", keywords="fraud", keywords="bots", keywords="research methods", keywords="data collection", keywords="verify", keywords="verification", keywords="participants", doi="10.2196/60548", url="https://www.i-jmr.org/2025/1/e60548" } @Article{info:doi/10.2196/64354, author="Ehrig, Molly and Bullock, S. Garrett and Leng, Iris Xiaoyan and Pajewski, M. Nicholas and Speiser, Lynn Jaime", title="Imputation and Missing Indicators for Handling Missing Longitudinal Data: Data Simulation Analysis Based on Electronic Health Record Data", journal="JMIR Med Inform", year="2025", month="Mar", day="13", volume="13", pages="e64354", keywords="missing indicator method", keywords="missing data", keywords="imputation", keywords="longitudinal data", keywords="electronic health record data", keywords="electronic health records", keywords="EHR", keywords="simulation study", keywords="clinical prediction model", keywords="prediction model", keywords="older adults", keywords="falls", keywords="logistic regression", keywords="prediction modeling", abstract="Background: Missing data in electronic health records are highly prevalent and result in analytical concerns such as heterogeneous sources of bias and loss of statistical power. One simple analytic method for addressing missing or unknown covariate values is to treat missingness for a particular variable as a category unto itself, which we refer to as the missing indicator method. 
For cross-sectional analyses, recent work suggested that there was minimal benefit to the missing indicator method; however, it is unclear how this approach performs in the setting of longitudinal data, in which correlation among clustered repeated measures may be leveraged for potentially improved model performance. Objectives: This study aims to conduct a simulation study to evaluate whether the missing indicator method improved model performance and imputation accuracy for longitudinal data mimicking an application of developing a clinical prediction model for falls in older adults based on electronic health record data. Methods: We simulated a longitudinal binary outcome using mixed effects logistic regression that emulated a falls assessment at annual follow-up visits. Using multivariate imputation by chained equations, we simulated time-invariant predictors such as sex and medical history, as well as dynamic predictors such as physical function, BMI, and medication use. We induced missing data in predictors under scenarios that had both random (missing at random) and dependent missingness (missing not at random). We evaluated aggregate performance using the area under the receiver operating characteristic curve (AUROC) for models with and without missing indicators as predictors, as well as complete case analysis, across simulation replicates. We evaluated imputation quality using normalized root-mean-square error for continuous variables and percent falsely classified for categorical variables. Results: Independent of the mechanism used to simulate missing data (missing at random or missing not at random), overall model performance via AUROC was similar regardless of whether missing indicators were included in the model. The root-mean-square error and percent falsely classified measures were similar for models including missing indicators versus those without missing indicators. Model performance and imputation quality were similar regardless of whether the outcome was related to missingness. Imputation with or without missing indicators had similar mean values of AUROC compared with complete case analysis, although complete case analysis had the largest range of values. Conclusions: The results of this study suggest that the inclusion of missing indicators in longitudinal data modeling neither improves nor worsens overall performance or imputation accuracy. Future research is needed to address whether the inclusion of missing indicators is useful in prediction modeling with longitudinal data in different settings, such as high-dimensional data analysis. 
", doi="10.2196/64354", url="https://medinform.jmir.org/2025/1/e64354" } @Article{info:doi/10.2196/63216, author="Yang, Zhongbao and Xu, Shan-Shan and Liu, Xiaozhu and Xu, Ningyuan and Chen, Yuqing and Wang, Shuya and Miao, Ming-Yue and Hou, Mengxue and Liu, Shuai and Zhou, Yi-Min and Zhou, Jian-Xin and Zhang, Linlin", title="Large Language Model--Based Critical Care Big Data Deployment and Extraction: Descriptive Analysis", journal="JMIR Med Inform", year="2025", month="Mar", day="12", volume="13", pages="e63216", keywords="big data", keywords="critical care--related databases", keywords="database deployment", keywords="large language model", keywords="database extraction", keywords="intensive care unit", keywords="ICU", keywords="GPT", keywords="artificial intelligence", keywords="AI", keywords="LLM", abstract="Background: Publicly accessible critical care--related databases contain enormous clinical data, but their utilization often requires advanced programming skills. The growing complexity of large databases and unstructured data presents challenges for clinicians who need programming or data analysis expertise to utilize these systems directly. Objective: This study aims to simplify critical care--related database deployment and extraction via large language models. Methods: The development of this platform was a 2-step process. First, we enabled automated database deployment using Docker container technology, with incorporated web-based analytics interfaces Metabase and Superset. Second, we developed the intensive care unit--generative pretrained transformer (ICU-GPT), a large language model fine-tuned on intensive care unit (ICU) data that integrated LangChain and Microsoft AutoGen. Results: The automated deployment platform was designed with user-friendliness in mind, enabling clinicians to deploy 1 or multiple databases in local, cloud, or remote environments without the need for manual setup. After successfully overcoming GPT's token limit and supporting multischema data, ICU-GPT could generate Structured Query Language (SQL) queries and extract insights from ICU datasets based on request input. A front-end user interface was developed for clinicians to achieve code-free SQL generation on the web-based client. Conclusions: By harnessing the power of our automated deployment platform and ICU-GPT model, clinicians are empowered to easily visualize, extract, and arrange critical care--related databases more efficiently and flexibly than manual methods. Our research could decrease the time and effort spent on complex bioinformatics methods and advance clinical research. ", doi="10.2196/63216", url="https://medinform.jmir.org/2025/1/e63216" } @Article{info:doi/10.2196/60649, author="Dekel, Dana and Marchant, Amanda and Del Pozo Banos, Marcos and Mhereeg, Mohamed and Lee, Chim Sze and John, Ann", title="Exploring the Views of Young People, Including Those With a History of Self-Harm, on the Use of Their Routinely Generated Data for Mental Health Research: Web-Based Cross-Sectional Survey Study", journal="JMIR Ment Health", year="2025", month="Mar", day="12", volume="12", pages="e60649", keywords="self-harm", keywords="mental health", keywords="big data", keywords="survey", keywords="youth", abstract="Background: Secondary use of routinely collected health care data has great potential benefits in epidemiological studies primarily due to the large scale of preexisting data. 
Objective: This study aimed to engage respondents with and without a history of self-harm, gain insight into their views on the use of their data for research, and determine whether there were any differences in opinions between the 2 groups. Methods: We examined young people's views on the use of their routinely collected data for mental health research through a web-based survey, evaluating any differences between those with and without a history of self-harm. Results: A total of 1765 respondents aged 16 to 24 years were included. Respondents' views were mostly positive toward the use and linkage of their data for research purposes for public benefit, particularly with regard to the use of health care data (mental health or otherwise), and generally echoed existing evidence on the opinions of older age groups. Individuals who reported a history of self-harm and subsequently contacted health services more often reported being ``extremely likely'' or ``likely'' to share mental health data (contacted: 209/609, 34.3\%; 95\% CI 28.0-41.2; not contacted: 169/782, 21.6\%; 95\% CI 15.8-28.7) and physical health data (contacted: 117/609, 19.2\%; 95\% CI 12.7-27.8; not contacted: 96/782, 12.3\%; 95\% CI 6.7-20.9) compared with those who had not contacted services. Respondents were overall less likely to want to share their social media data, which they considered to be more personal compared to their health care data. Respondents stressed the importance of anonymity and the need for an appropriate ethical framework. Conclusions: Young people are aware, and they care about how their data are being used and for what purposes, irrespective of having a history of self-harm. They are largely positive about the use of health care data (mental or physical) for research and generally echo the opinions of older age groups raising issues around data security and the use of data for the public interest. ", doi="10.2196/60649", url="https://mental.jmir.org/2025/1/e60649" } @Article{info:doi/10.2196/59377, author="Gao, Jing and Jie, Xu and Yao, Yujun and Xue, Jingdong and Chen, Lei and Chen, Ruiyao and Chen, Jiayuan and Cheng, Weiwei", title="Fetal Birth Weight Prediction in the Third Trimester: Retrospective Cohort Study and Development of an Ensemble Model", journal="JMIR Pediatr Parent", year="2025", month="Mar", day="10", volume="8", pages="e59377", keywords="fetal birthweight", keywords="ensemble learning model", keywords="machine learning", keywords="prediction model", keywords="ultrasonography", keywords="macrosomia", keywords="low birth weight", keywords="birth weight", keywords="fetal", keywords="AI", keywords="artificial intelligence", keywords="prenatal", keywords="prenatal care", keywords="Shanghai", keywords="neonatal", keywords="maternal", keywords="parental", abstract="Background: Accurate third-trimester birth weight prediction is vital for reducing adverse outcomes, and machine learning (ML) offers superior precision over traditional ultrasound methods. Objective: This study aims to develop an ML model on the basis of clinical big data for accurate prediction of birth weight in the third trimester of pregnancy, which can help reduce adverse maternal and fetal outcomes. Methods: From January 1, 2018 to December 31, 2019, a retrospective cohort study involving 16,655 singleton live births without congenital anomalies (>28 weeks of gestation) was conducted in a tertiary first-class hospital in Shanghai. 
The initial set of data was divided, in a ratio of 4:1, into a train set for algorithm development and a test set on which the algorithm was evaluated. We extracted maternal and neonatal delivery outcomes, as well as parental demographics, obstetric clinical data, and sonographic fetal biometry, from electronic medical records. A total of 5 basic ML algorithms, namely ridge regression, support vector machine (SVM), random forest, extreme gradient boosting (XGBoost), and multilayer perceptron, were used to develop the prediction model, which was then averaged into an ensemble learning model. The models were compared using accuracy, mean squared error, root mean squared error, and mean absolute error. International Peace Maternity and Child Health Hospital's Research Ethics Committee granted ethical approval for the usage of patient information (GKLW2021-20). Results: Train and test sets contained a total of 13,324 and 3331 cases, respectively. From a total of 59 variables, we selected 17 variables that were readily available for the ``few feature model,'' which achieved high predictive power with an accuracy of 81\% and significantly exceeded ultrasound formula methods. In addition, our model maintained superior performance for low birth weight and macrosomic fetal populations. Conclusions: Our research investigated an innovative artificial intelligence model for predicting fetal birth weight and maximizing health care resource use. In the era of big data, our model improves maternal and fetal outcomes and promotes precision medicine. ", doi="10.2196/59377", url="https://pediatrics.jmir.org/2025/1/e59377" } @Article{info:doi/10.2196/59792, author="He, Rosemary and Sarwal, Varuni and Qiu, Xinru and Zhuang, Yongwen and Zhang, Le and Liu, Yue and Chiang, Jeffrey", title="Generative AI Models in Time-Varying Biomedical Data: Scoping Review", journal="J Med Internet Res", year="2025", month="Mar", day="10", volume="27", pages="e59792", keywords="generative artificial intelligence", keywords="artificial intelligence", keywords="time series", keywords="electronic health records", keywords="electronic medical records", keywords="systematic reviews", keywords="disease trajectory", keywords="machine learning", keywords="algorithms", keywords="forecasting", abstract="Background: Trajectory modeling is a long-standing challenge in the application of computational methods to health care. In the age of big data, traditional statistical and machine learning methods do not achieve satisfactory results as they often fail to capture the complex underlying distributions of multimodal health data and long-term dependencies throughout medical histories. Recent advances in generative artificial intelligence (AI) have provided powerful tools to represent complex distributions and patterns with minimal underlying assumptions, with major impact in fields such as finance and environmental sciences, prompting researchers to apply these methods for disease modeling in health care. Objective: While AI methods have proven powerful, their application in clinical practice remains limited due to their highly complex nature. The proliferation of AI algorithms also poses a significant challenge for nondevelopers to track and incorporate these advances into clinical research and application. In this paper, we introduce basic concepts in generative AI and discuss current algorithms and how they can be applied to health care for practitioners with little background in computer science. 
Methods: We surveyed peer-reviewed papers on generative AI models with specific applications to time-series health data. Our search included single- and multimodal generative AI models that operated over structured and unstructured data, physiological waveforms, medical imaging, and multi-omics data. We introduce current generative AI methods, review their applications, and discuss their limitations and future directions in each data modality. Results: We followed the PRISMA-ScR (Preferred Reporting Items for Systematic Reviews and Meta-Analyses extension for Scoping Reviews) guidelines and reviewed 155 articles on generative AI applications to time-series health care data across modalities. Furthermore, we offer a systematic framework for clinicians to easily identify suitable AI methods for their data and task at hand. Conclusions: We reviewed and critiqued existing applications of generative AI to time-series health data with the aim of bridging the gap between computational methods and clinical application. We also identified the shortcomings of existing approaches and highlighted recent advances in generative AI that represent promising directions for health care modeling. ", doi="10.2196/59792", url="https://www.jmir.org/2025/1/e59792" } @Article{info:doi/10.2196/64721, author="Tawfik, Daniel and Rule, Adam and Alexanian, Aram and Cross, Dori and Holmgren, Jay A. and Lou, S. Sunny and McPeek Hinz, Eugenia and Rose, Christian and Viswanadham, N. Ratnalekha V. and Mishuris, G. Rebecca and Rodr{\'i}guez-Fern{\'a}ndez, M. Jorge and Ford, W. Eric and Florig, T. Sarah and Sinsky, A. Christine and Apathy, C. Nate", title="Emerging Domains for Measuring Health Care Delivery With Electronic Health Record Metadata", journal="J Med Internet Res", year="2025", month="Mar", day="6", volume="27", pages="e64721", keywords="metadata", keywords="health services research", keywords="audit logs", keywords="event logs", keywords="electronic health record data", keywords="health care delivery", keywords="patient care", keywords="healthcare teams", keywords="clinician-patient relationship", keywords="cognitive environment", doi="10.2196/64721", url="https://www.jmir.org/2025/1/e64721", url="http://www.ncbi.nlm.nih.gov/pubmed/40053814" } @Article{info:doi/10.2196/56671, author="Oh, Geum Eui and Oh, Sunyoung and Cho, Seunghyeon and Moon, Mir", title="Predicting Readmission Among High-Risk Discharged Patients Using a Machine Learning Model With Nursing Data: Retrospective Study", journal="JMIR Med Inform", year="2025", month="Mar", day="5", volume="13", pages="e56671", keywords="machine learning", keywords="EHR", keywords="electronic health record", keywords="electronic medical record", keywords="EMR", keywords="artificial intelligence", keywords="readmission", keywords="nursing data", keywords="clinical decision support", keywords="prediction", keywords="predictive", keywords="discharge", keywords="admission", keywords="hospitalization", abstract="Background: Unplanned readmissions increase unnecessary health care costs and reduce the quality of care. It is essential to plan the discharge care from the beginning of hospitalization to reduce the risk of readmission. Machine learning--based readmission prediction models can support patients' preemptive discharge care services with improved predictive power. Objective: This study aimed to develop a readmission early prediction model utilizing nursing data for high-risk discharge patients. 
Methods: This retrospective study included the electronic medical records of 12,977 patients with 1 of the top 6 high-risk readmission diseases at a tertiary hospital in Seoul from January 2018 to January 2020. We used demographic, clinical, and nursing data to construct a prediction model. We constructed unplanned readmission prediction models by dividing them into Model 1 and Model 2. Model 1 used early hospitalization data (up to 1 day after admission), and Model 2 used all the data. To improve the performance of the machine learning method, we performed 5-fold cross-validation and utilized adaptive synthetic sampling to address data imbalance. The 6 algorithms of logistic regression, random forest, decision tree, XGBoost, CatBoost, and multilayer perceptron were employed to develop predictive models. The analysis was conducted using Python, version 3.11.3 (Python Software Foundation). Results: In Model 1, among the 6 prediction model algorithms, the random forest model had the best result, with an area under the receiver operating characteristic (AUROC) curve of 0.62. In Model 2, the CatBoost model had the best result, with an AUROC of 0.64. BMI, systolic blood pressure, and age consistently emerged as the most significant predictors of readmission risk across Models 1 and 2. Model 1, which enabled early readmission prediction, showed a higher proportion of nursing data variables among its important predictors compared to Model 2. Conclusions: Machine learning--based readmission prediction models utilizing nursing data provide basic data for evidence-based clinical decision support for high-risk discharge patients with complex conditions and facilitate early intervention. By integrating nursing data containing diverse patient information, these models can provide more comprehensive risk assessment and improve patient outcomes. ", doi="10.2196/56671", url="https://medinform.jmir.org/2025/1/e56671" } @Article{info:doi/10.2196/68863, author="Ohno, Yukiko and Aomori, Tohru and Nishiyama, Tomohiro and Kato, Riri and Fujiki, Reina and Ishikawa, Haruki and Kiyomiya, Keisuke and Isawa, Minae and Mochizuki, Mayumi and Aramaki, Eiji and Ohtani, Hisakazu", title="Performance Improvement of a Natural Language Processing Tool for Extracting Patient Narratives Related to Medical States From Japanese Pharmaceutical Care Records by Increasing the Amount of Training Data: Natural Language Processing Analysis and Validation Study", journal="JMIR Med Inform", year="2025", month="Mar", day="4", volume="13", pages="e68863", keywords="natural language processing", keywords="NLP", keywords="named entity recognition", keywords="NER", keywords="deep learning", keywords="pharmaceutical care record", keywords="electronic medical record", keywords="EMR", keywords="Japanese", abstract="Background: Patients' oral expressions serve as valuable sources of clinical information to improve pharmacotherapy. Natural language processing (NLP) is a useful approach for analyzing unstructured text data, such as patient narratives. However, few studies have focused on using NLP for narratives in the Japanese language. Objective: We aimed to develop a high-performance NLP system for extracting clinical information from patient narratives by examining the performance progression with a gradual increase in the amount of training data. 
Methods: We used subjective texts from the pharmaceutical care records of Keio University Hospital from April 1, 2018, to March 31, 2019, comprising 12,004 records from 6559 cases. After preprocessing, we annotated diseases and symptoms within the texts. We then trained and evaluated a deep learning model (bidirectional encoder representations from transformers combined with a conditional random field [BERT-CRF]) through 10-fold cross-validation. The annotated data were divided into 10 subsets, and the amount of training data was progressively increased over 10 steps. We also analyzed the causes of errors. Finally, we applied the developed system to the analysis of case report texts to evaluate its usability for texts from other sources. Results: The F1-score of the system improved from 0.67 to 0.82 as the amount of training data increased from 1200 to 12,004 records. The F1-score reached 0.78 with 3600 records and was largely similar thereafter. As performance improved, errors from incorrect extractions decreased significantly, which resulted in an increase in precision. For case reports, the F1-score also increased from 0.34 to 0.41 as the training dataset expanded from 1200 to 12,004 records. Performance was lower for extracting symptoms from case report texts compared with pharmaceutical care records, suggesting that this system is more specialized for analyzing subjective data from pharmaceutical care records. Conclusions: We successfully developed a high-performance system specialized in analyzing subjective data from pharmaceutical care records by training a large dataset, with near-complete saturation of system performance with about 3600 training records. This system will be useful for monitoring symptoms, offering benefits for both clinical practice and research. ", doi="10.2196/68863", url="https://medinform.jmir.org/2025/1/e68863", url="http://www.ncbi.nlm.nih.gov/pubmed/40053805" } @Article{info:doi/10.2196/68135, author="Peasley, Dale and Kuplicki, Rayus and Sen, Sandip and Paulus, Martin", title="Leveraging Large Language Models and Agent-Based Systems for Scientific Data Analysis: Validation Study", journal="JMIR Ment Health", year="2025", month="Feb", day="13", volume="12", pages="e68135", keywords="LLM", keywords="agent-based systems", keywords="scientific data analysis", keywords="data contextualization", keywords="AI-driven research tools", keywords="large language model", keywords="scientific data", keywords="analysis", keywords="contextualization", keywords="AI", keywords="artificial intelligence", keywords="research tool", abstract="Background: Large language models have shown promise in transforming how complex scientific data are analyzed and communicated, yet their application to scientific domains remains challenged by issues of factual accuracy and domain-specific precision. The Laureate Institute for Brain Research--Tulsa University (LIBR-TU) Research Agent (LITURAt) leverages a sophisticated agent-based architecture to mitigate these limitations, using external data retrieval and analysis tools to ensure reliable, context-aware outputs that make scientific information accessible to both experts and nonexperts. Objective: The objective of this study was to develop and evaluate LITURAt to enable efficient analysis and contextualization of complex scientific datasets for diverse user expertise levels. 
Methods: An agent-based system based on large language models was designed to analyze and contextualize complex scientific datasets using a ``plan-and-solve'' framework. The system dynamically retrieves local data and relevant PubMed literature, performs statistical analyses, and generates comprehensive, context-aware summaries to answer user queries with high accuracy and consistency. Results: Our experiments demonstrated that LITURAt achieved an internal consistency rate of 94.8\% and an external consistency rate of 91.9\% across repeated and rephrased queries. Additionally, GPT-4 evaluations rated 80.3\% (171/213) of the system's answers as accurate and comprehensive, with 23.5\% (50/213) receiving the highest rating of 5 for completeness and precision. Conclusions: These findings highlight the potential of LITURAt to significantly enhance the accessibility and accuracy of scientific data analysis, achieving high consistency and strong performance in complex query resolution. Despite existing limitations, such as model stability for highly variable queries, LITURAt demonstrates promise as a robust tool for democratizing data-driven insights across diverse scientific domains. ", doi="10.2196/68135", url="https://mental.jmir.org/2025/1/e68135" } @Article{info:doi/10.2196/66910, author="Seinen, M. Tom and Kors, A. Jan and van Mulligen, M. Erik and Rijnbeek, R. Peter", title="Using Structured Codes and Free-Text Notes to Measure Information Complementarity in Electronic Health Records: Feasibility and Validation Study", journal="J Med Internet Res", year="2025", month="Feb", day="13", volume="27", pages="e66910", keywords="natural language processing", keywords="named entity recognition", keywords="clinical concept extraction", keywords="machine learning", keywords="electronic health records", keywords="EHR", keywords="word embeddings", keywords="clinical concept similarity", keywords="text mining", keywords="code", keywords="free-text", keywords="information", keywords="electronic record", keywords="data", keywords="patient records", keywords="framework", keywords="structured data", keywords="unstructured data", abstract="Background: Electronic health records (EHRs) consist of both structured data (eg, diagnostic codes) and unstructured data (eg, clinical notes). It is commonly believed that unstructured clinical narratives provide more comprehensive information. However, this assumption lacks large-scale validation and direct validation methods. Objective: This study aims to quantitatively compare the information in structured and unstructured EHR data and directly validate whether unstructured data offers more extensive information across a patient population. Methods: We analyzed both structured and unstructured data from patient records and visits in a large Dutch primary care EHR database between January 2021 and January 2024. Clinical concepts were identified from free-text notes using an extraction framework tailored for Dutch and compared with concepts from structured data. Concept embeddings were generated to measure semantic similarity between structured and extracted concepts through cosine similarity. A similarity threshold was systematically determined via annotated matches and minimized weighted Gini impurity. We then quantified the concept overlap between structured and unstructured data across various concept domains and patient populations. 
Results: In a population of 1.8 million patients, only 13\% of extracted concepts from patient records and 7\% from individual visits had similar structured counterparts. Conversely, 42\% of structured concepts in records and 25\% in visits had similar matches in unstructured data. Condition concepts had the highest overlap, followed by measurements and drug concepts. Subpopulation visits, such as those with chronic conditions or psychological disorders, showed different proportions of data overlap, indicating varied reliance on structured versus unstructured data across clinical contexts. Conclusions: Our study demonstrates the feasibility of quantifying the information difference between structured and unstructured data, showing that the unstructured data provides important additional information in the studied database and populations. The annotated concept matches are made publicly available for the clinical natural language processing community. Despite some limitations, our proposed methodology proves versatile, and its application can lead to more robust and insightful observational clinical research. ", doi="10.2196/66910", url="https://www.jmir.org/2025/1/e66910" } @Article{info:doi/10.2196/64972, author="Xu, Qian and Cai, Xue and Yu, Ruicong and Zheng, Yueyue and Chen, Guanjie and Sun, Hui and Gao, Tianyun and Xu, Cuirong and Sun, Jing", title="Machine Learning--Based Risk Factor Analysis and Prediction Model Construction for the Occurrence of Chronic Heart Failure: Health Ecologic Study", journal="JMIR Med Inform", year="2025", month="Jan", day="31", volume="13", pages="e64972", keywords="machine learning", keywords="chronic heart failure", keywords="risk of occurrence", keywords="prediction model", keywords="health ecology", abstract="Background: Chronic heart failure (CHF) is a serious threat to human health, with high morbidity and mortality rates, imposing a heavy burden on the health care system and society. With the abundance of medical data and the rapid development of machine learning (ML) technologies, new opportunities are provided for in-depth investigation of the mechanisms of CHF and the construction of predictive models. The introduction of health ecology research methodology enables a comprehensive dissection of CHF risk factors from a wider range of environmental, social, and individual factors. This not only helps to identify high-risk groups at an early stage but also provides a scientific basis for the development of precise prevention and intervention strategies. Objective: This study aims to use ML to construct a predictive model of the risk of occurrence of CHF and analyze the risk of CHF from a health ecology perspective. Methods: This study sourced data from the Jackson Heart Study database. Stringent data preprocessing procedures were implemented, which included meticulous management of missing values and the standardization of data. Principal component analysis and random forest (RF) were used as feature selection techniques. Subsequently, several ML models, namely decision tree, RF, extreme gradient boosting (XGBoost), adaptive boosting (AdaBoost), support vector machine, naive Bayes model, multilayer perceptron, and bootstrap forest, were constructed, and their performance was evaluated. The effectiveness of the models was validated through internal validation using a 10-fold cross-validation approach on the training and validation sets. In addition, the performance metrics of each model, including accuracy, precision, sensitivity, F1-score, and area under the curve (AUC), were compared. 
After selecting the best model, we used hyperparameter optimization to further improve its performance. Results: RF-selected features (21 in total) had an average root mean square error of 0.30, outperforming principal component analysis. Synthetic Minority Oversampling Technique and Edited Nearest Neighbors showed better accuracy in data balancing. The AdaBoost model was most effective with an AUC of 0.86, accuracy of 75.30\%, precision of 0.86, sensitivity of 0.69, and F1-score of 0.76. Validation on the training and validation sets through 10-fold cross-validation gave an AUC of 0.97, an accuracy of 91.27\%, a precision of 0.94, a sensitivity of 0.92, and an F1-score of 0.94. After random search hyperparameter tuning, the accuracy of AdaBoost improved to 77.68\%, while its AUC remained 0.86. Conclusions: This study offered insights into CHF risk prediction. Future research should focus on prospective studies, diverse data, advanced techniques, longitudinal studies, and exploring factor interactions for better CHF prevention and management. ", doi="10.2196/64972", url="https://medinform.jmir.org/2025/1/e64972" } @Article{info:doi/10.2196/63109, author="Ghaffar, Faisal and Furtado, M. Nadine and Ali, Imad and Burns, Catherine", title="Diagnostic Decision-Making Variability Between Novice and Expert Optometrists for Glaucoma: Comparative Analysis to Inform AI System Design", journal="JMIR Med Inform", year="2025", month="Jan", day="29", volume="13", pages="e63109", keywords="decision-making", keywords="human-centered AI design", keywords="human factors", keywords="experts versus novices differences", keywords="optometry", keywords="glaucoma diagnosis", keywords="experts versus novices", keywords="glaucoma", keywords="eye disease", keywords="vision", keywords="vision impairment", keywords="comparative analysis", keywords="methodology", keywords="optometrist", keywords="artificial intelligence", keywords="AI", keywords="diagnostic accuracy", keywords="consistency", keywords="clinical data", keywords="risk assessment", keywords="progression analysis", abstract="Background: While expert optometrists tend to rely on a deep understanding of the disease and intuitive pattern recognition, those with less experience may depend more on extensive data, comparisons, and external guidance. Understanding these variations is important for developing artificial intelligence (AI) systems that can effectively support optometrists with varying degrees of experience and minimize decision inconsistencies. Objective: The main objective of this study is to identify and analyze the variations in diagnostic decision-making approaches between novice and expert optometrists. By understanding these variations, we aim to provide guidelines for the development of AI systems that can support optometrists with varying levels of expertise. These guidelines will assist in developing AI systems for glaucoma diagnosis, ultimately enhancing the diagnostic accuracy of optometrists and minimizing inconsistencies in their decisions. Methods: We conducted in-depth interviews with 14 optometrists using a within-subject design, including both novices and experts, focusing on their approaches to glaucoma diagnosis. The responses were coded and analyzed using a mixed method approach incorporating both qualitative and quantitative analysis. Statistical tests such as Mann-Whitney U and chi-square tests were used to find significance in intergroup variations. 
These findings were further supported by themes extracted through qualitative analysis, which helped to identify decision-making patterns and understand variations in their approaches. Results: Both groups showed low concordance rates with the clinical diagnosis when data were limited, with experts showing almost double the concordance rate (7/35, 20\%) of novices (7/69, 10\%), highlighting the impact of experience and data availability on clinical judgment; this rate increased to about 40\% for both groups (experts: 5/12, 42\%; novices: 8/21, 42\%) when they had access to the patient's complete historical data. We also found statistically significant intergroup differences between the first visits and subsequent visits with a P value of less than .05 on the Mann-Whitney U test in many assessments. Furthermore, approaches to the exam assessment and decision differed significantly: experts emphasized comprehensive risk assessments and progression analysis, demonstrating cognitive efficiency and intuitive decision-making, while novices relied more on structured, analytical methods and external references. Additionally, significant variations in patient follow-up times were observed, with a P value of <.001 on the chi-square test, showing a stronger influence of experience on follow-up time decisions. Conclusions: The study highlights significant variations in the decision-making process of novice and expert optometrists in glaucoma diagnosis, with experience playing a key role in accuracy, approach, and management. These findings demonstrate the critical need for AI systems tailored to varying levels of expertise. They also provide insights for the future design of AI systems aimed at enhancing the diagnostic accuracy of optometrists and consistency across different expertise levels, ultimately improving patient outcomes in optometric practice. 
", doi="10.2196/63109", url="https://medinform.jmir.org/2025/1/e63109", url="http://www.ncbi.nlm.nih.gov/pubmed/39879089" } @Article{info:doi/10.2196/53542, author="Demuth, Stanislas and De S{\`e}ze, J{\'e}r{\^o}me and Edan, Gilles and Ziemssen, Tjalf and Simon, Fran{\c{c}}oise and Gourraud, Pierre-Antoine", title="Digital Representation of Patients as Medical Digital Twins: Data-Centric Viewpoint", journal="JMIR Med Inform", year="2025", month="Jan", day="28", volume="13", pages="e53542", keywords="digital twin", keywords="artificial intelligence", keywords="data architecture", keywords="synthetic data", keywords="computational modeling", keywords="precision medicine", keywords="conceptual clarification", keywords="conceptual", keywords="patient", keywords="medicine", keywords="health record", keywords="digital records", keywords="synthetic patient", doi="10.2196/53542", url="https://medinform.jmir.org/2025/1/e53542" } @Article{info:doi/10.2196/59452, author="Willem, Theresa and Wollek, Alessandro and Cheslerean-Boghiu, Theodor and Kenney, Martha and Buyx, Alena", title="The Social Construction of Categorical Data: Mixed Methods Approach to Assessing Data Features in Publicly Available Datasets", journal="JMIR Med Inform", year="2025", month="Jan", day="28", volume="13", pages="e59452", keywords="machine learning", keywords="categorical data", keywords="social context dependency", keywords="mixed methods", keywords="dermatology", keywords="dataset analysis", abstract="Background: In data-sparse areas such as health care, computer scientists aim to leverage as much available information as possible to increase the accuracy of their machine learning models' outputs. As a standard, categorical data, such as patients' gender, socioeconomic status, or skin color, are used to train models in fusion with other data types, such as medical images and text-based medical information. However, the effects of including categorical data features for model training in such data-scarce areas are underexamined, particularly regarding models intended to serve individuals equitably in a diverse population. Objective: This study aimed to explore categorical data's effects on machine learning model outputs, rooted the effects in the data collection and dataset publication processes, and proposed a mixed methods approach to examining datasets' data categories before using them for machine learning training. Methods: Against the theoretical background of the?social construction of categories, we suggest a mixed methods approach to assess categorical data's utility for machine learning model training. As an example, we applied our approach to a Brazilian dermatological dataset (Dermatological and Surgical Assistance Program at the Federal University of Esp{\'i}rito Santo [PAD-UFES] 20). We first present an exploratory, quantitative study that assesses the effects when including or excluding each of the unique categorical data features of the PAD-UFES 20 dataset for training a transformer-based model using a data fusion algorithm. We then pair our quantitative analysis with a qualitative examination of the data categories based on interviews with the dataset authors. Results: Our quantitative study suggests scattered effects of including categorical data for machine learning model training across predictive classes. Our qualitative analysis gives insights into how the categorical data were collected and why they were published, explaining some of the quantitative effects that we observed. 
Our findings highlight the social constructedness of categorical data in publicly available datasets, meaning that the data in a category heavily depend on both how these categories are defined by the dataset creators and the sociomedical context in which the data are collected. This reveals relevant limitations of using publicly available datasets in contexts different from those in which their data were collected. Conclusions: We caution against using data features of publicly available datasets without reflection on the social construction and context dependency of their categorical data features, particularly in data-sparse areas. We conclude that social scientific, context-dependent analysis of available data features using both quantitative and qualitative methods is helpful in judging the utility of categorical data for the population for which a model is intended. ", doi="10.2196/59452", url="https://medinform.jmir.org/2025/1/e59452" } @Article{info:doi/10.2196/63583, author="Phaswana-Mafuya, Nancy Refilwe and Phalane, Edith and Rao, Amrita and Willis, Kalai and Rucinski, Katherine and Voet, Alida K. and Abdulrahman, Amal and Siyamayambo, Claris and Sebati, Betty and Seloka, Mohlago and Jaiteh, Musa and Olifant, Lucia Lerato and Journeay, Katharine and Sisel, Haley and Li, Xiaoming and Olatosi, Bankole and Hikmet, Neset and Duhoon, Prashant and Wolmarans, Francois and Shiferaw, A. Yegnanew and Motsieloa, Lifutso and Rampilo, Mashudu and Baral, Stefan", title="Harnessing Big Heterogeneous Data to Evaluate the Potential Impact of HIV Responses Among Key Populations in Sub-Saharan Africa: Protocol for the Boloka Data Repository Initiative", journal="JMIR Res Protoc", year="2025", month="Jan", day="22", volume="14", pages="e63583", keywords="HIV", keywords="key populations", keywords="Sub-Saharan Africa", keywords="big heterogeneous data", keywords="Boloka data repository", abstract="Background: In South Africa, there is no centralized HIV surveillance system where data on key populations (KPs), including gay men and other men who have sex with men, female sex workers, transgender persons, people who use drugs, and incarcerated persons, are stored, despite these groups being at higher risk of HIV acquisition and transmission than the general population. Data on KPs are being collected on a smaller scale by numerous stakeholders and managed in silos. There exists an opportunity to harness a variety of data, such as empirical, contextual, observational, and programmatic data, for evaluating the potential impact of HIV responses among KPs in South Africa. Objective: This study aimed to leverage and harness big heterogeneous data on HIV among KPs and harmonize and analyze it to inform a targeted HIV response for greater impact in Sub-Saharan Africa. Methods: The Boloka data repository initiative has 5 stages. There will be engagement of a wide range of stakeholders to facilitate the acquisition of data (stage 1). Through these engagements, different data types will be collated (stage 2). The data will be filtered and screened to enable high-quality analyses (stage 3). The collated data will be stored in the Boloka data repository (stage 4). The Boloka data repository will be made accessible to stakeholders and authorized users (stage 5). Results: The protocol was funded by the South African Medical Research Council following external peer reviews (December 2022). 
The study received initial ethics approval (May 2022), renewal (June 2023), and amendment (July 2024) from the University of Johannesburg (UJ) Research Ethics Committee. The research team has been recruited, onboarded, and received non--web-based internet ethics training (January 2023). A list of current and potential data partners has been compiled (January 2023 to date). Data sharing or user agreements have been signed with several data partners (August 2023 to date). Survey and routine data have been and are being secured (January 5, 2023). In September 2024, we received Ghana Men Study data. The data transfer agreement between the Pan African Centre for Epidemics Research and the Perinatal HIV Research Unit was finalized (October 2024), and we are anticipating receiving data by December 2024. In total, 7 abstracts are underway; for 1 of these, the analysis has been completed, and the full article is expected to be submitted to a peer-reviewed journal in early January 2025. As of March 2025, we expect to submit the remaining 6 full articles. Conclusions: A truly ``complete'' data infrastructure that systematically and rigorously integrates diverse data for KPs will not only improve our understanding of local epidemics but will also improve HIV interventions and policies. Furthermore, it will inform future research directions and become an incredible institutional mechanism for epidemiological and public health training in South Africa and Sub-Saharan Africa. International Registered Report Identifier (IRRID): DERR1-10.2196/63583 ", doi="10.2196/63583", url="https://www.researchprotocols.org/2025/1/e63583" } @Article{info:doi/10.2196/54133, author="Yang, Doris and Zhou, Doudou and Cai, Steven and Gan, Ziming and Pencina, Michael and Avillach, Paul and Cai, Tianxi and Hong, Chuan", title="Robust Automated Harmonization of Heterogeneous Data Through Ensemble Machine Learning: Algorithm Development and Validation Study", journal="JMIR Med Inform", year="2025", month="Jan", day="22", volume="13", pages="e54133", keywords="ensemble learning", keywords="semantic learning", keywords="distribution learning", keywords="variable harmonization", keywords="machine learning", keywords="cardiovascular health study", keywords="intracohort comparison", keywords="intercohort comparison", keywords="gold standard labels", abstract="Background: Cohort studies contain rich clinical data across large and diverse patient populations and are a common source of observational data for clinical research. Because large-scale cohort studies are both time and resource intensive, one alternative is to harmonize data from existing cohorts through multicohort studies. However, given differences in variable encoding, accurate variable harmonization is difficult. Objective: We propose SONAR (Semantic and Distribution-Based Harmonization) as a method for harmonizing variables across cohort studies to facilitate multicohort studies. Methods: SONAR used semantic learning from variable descriptions and distribution learning from study participant data. Our method learned an embedding vector for each variable and used pairwise cosine similarity to score the similarity between variables. This approach was built on 3 National Institutes of Health cohorts, including the Cardiovascular Health Study, the Multi-Ethnic Study of Atherosclerosis, and the Women's Health Initiative. We also used gold standard labels to further refine the embeddings in a supervised manner. 
Results: The method was evaluated using manually curated gold standard labels from the 3 National Institutes of Health cohorts. We evaluated both the intracohort and intercohort variable harmonization performance. The supervised SONAR method outperformed existing benchmark methods for almost all intracohort and intercohort comparisons using area under the curve and top-k accuracy metrics. Notably, SONAR was able to significantly improve harmonization of concepts that were difficult for existing semantic methods to harmonize. Conclusions: SONAR achieves accurate variable harmonization within and between cohort studies by harnessing the complementary strengths of semantic learning and variable distribution learning. ", doi="10.2196/54133", url="https://medinform.jmir.org/2025/1/e54133" } @Article{info:doi/10.2196/67878, author="Eysenbach, Gunther", title="Crisis Text Line and Loris.ai Controversy Highlights the Complexity of Informed Consent on the Internet and Data-Sharing Ethics for Machine Learning and Research", journal="J Med Internet Res", year="2025", month="Jan", day="22", volume="27", pages="e67878", keywords="data ethics", keywords="data sharing", keywords="informed consent", keywords="disclosure", keywords="conflict of interest", keywords="transparency", keywords="trust", doi="10.2196/67878", url="https://www.jmir.org/2025/1/e67878" } @Article{info:doi/10.2196/65047, author="Fukushima, Takuya and Manabe, Masae and Yada, Shuntaro and Wakamiya, Shoko and Yoshida, Akiko and Urakawa, Yusaku and Maeda, Akiko and Kan, Shigeyuki and Takahashi, Masayo and Aramaki, Eiji", title="Evaluating and Enhancing Japanese Large Language Models for Genetic Counseling Support: Comparative Study of Domain Adaptation and the Development of an Expert-Evaluated Dataset", journal="JMIR Med Inform", year="2025", month="Jan", day="16", volume="13", pages="e65047", keywords="large language models", keywords="genetic counseling", keywords="medical", keywords="health", keywords="artificial intelligence", keywords="machine learning", keywords="domain adaptation", keywords="retrieval-augmented generation", keywords="instruction tuning", keywords="prompt engineering", keywords="question-answer", keywords="dialogue", keywords="ethics", keywords="safety", keywords="low-rank adaptation", keywords="Japanese", keywords="expert evaluation", abstract="Background: Advances in genetics have underscored a strong association between genetic factors and health outcomes, leading to an increased demand for genetic counseling services. However, a shortage of qualified genetic counselors poses a significant challenge. Large language models (LLMs) have emerged as a potential solution for augmenting support in genetic counseling tasks. Despite the potential, Japanese genetic counseling LLMs (JGCLLMs) are underexplored. To advance a JGCLLM-based dialogue system for genetic counseling, effective domain adaptation methods require investigation. Objective: This study aims to evaluate the current capabilities and identify challenges in developing a JGCLLM-based dialogue system for genetic counseling. The primary focus is to assess the effectiveness of prompt engineering, retrieval-augmented generation (RAG), and instruction tuning within the context of genetic counseling. Furthermore, we will establish an expert-evaluated dataset of responses generated by LLMs adapted to Japanese genetic counseling for the future development of JGCLLMs. 
Methods: Two primary datasets were used in this study: (1) a question-answer (QA) dataset for LLM adaptation and (2) a genetic counseling question dataset for evaluation. The QA dataset included 899 QA pairs covering medical and genetic counseling topics, while the evaluation dataset contained 120 curated questions across 6 genetic counseling categories. Three enhancement techniques of LLMs---instruction tuning, RAG, and prompt engineering---were applied to a lightweight Japanese LLM to enhance its ability for genetic counseling. The performance of the adapted LLM was evaluated on the 120-question dataset by 2 certified genetic counselors and 1 ophthalmologist (SK, YU, and AY). Evaluation focused on 4 metrics: (1) inappropriateness of information, (2) sufficiency of information, (3) severity of harm, and (4) alignment with medical consensus. Results: The evaluation by certified genetic counselors and an ophthalmologist revealed varied outcomes across different methods. RAG showed potential, particularly in enhancing critical aspects of genetic counseling. In contrast, instruction tuning and prompt engineering produced less favorable outcomes. This evaluation process facilitated the creation of an expert-evaluated dataset of responses generated by LLMs adapted with different combinations of these methods. Error analysis identified key ethical concerns, including inappropriate promotion of prenatal testing, criticism of relatives, and inaccurate probability statements. Conclusions: RAG demonstrated notable improvements across all evaluation metrics, suggesting potential for further enhancement through the expansion of RAG data. The expert-evaluated dataset developed in this study provides valuable insights for future optimization efforts. However, the ethical issues observed in JGCLLM responses underscore the critical need for ongoing refinement and thorough ethical evaluation before these systems can be implemented in health care settings. ", doi="10.2196/65047", url="https://medinform.jmir.org/2025/1/e65047" } @Article{info:doi/10.2196/62645, author="Wu, Mengqiu and Xue, Yongxi and Ma, Chengyu", title="The Association Between the Digital Divide and Health Inequalities Among Older Adults in China: Nationally Representative Cross-Sectional Survey", journal="J Med Internet Res", year="2025", month="Jan", day="15", volume="27", pages="e62645", keywords="older adults", keywords="digital divide", keywords="internet use", keywords="internet access", keywords="health inequalities", abstract="Background: Health inequalities among older adults become increasingly pronounced as aging progresses. In the digital era, some researchers argue that access to and use of digital technologies may contribute to or exacerbate these existing health inequalities. Conversely, other researchers believe that digital technologies can help mitigate these disparities. Objective: This study aimed to investigate the relationship between the digital divide and health inequality among older adults and to offer recommendations for promoting health equity. Methods: Data were obtained from the 2018 and 2020 waves of the China Health and Retirement Longitudinal Study. Physical, mental, and subjective health were assessed using the Activities of Daily Living (ADL) scale, the Instrumental Activities of Daily Living scale, the Mini-Mental State Examination scale, and a 5-point self-rated health scale, respectively.
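A toy Python sketch of the retrieval-augmented generation setup evaluated in the Fukushima et al genetic counseling entry above: retrieve the most relevant reference passage for a question, then prepend it to the LLM prompt. The passages, question, and prompt wording are invented for illustration, and the final generation call is left as a stub.

# Minimal RAG pipeline sketch (retrieve, then assemble the prompt).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

passages = [
    "Genetic counseling should present options without directing decisions.",
    "Carrier screening estimates the chance a child inherits a condition.",
    "Prenatal testing has benefits and risks that must be explained neutrally.",
]
question = "How should risks of prenatal testing be communicated?"

# TF-IDF retrieval stands in here for the study's retriever.
vec = TfidfVectorizer().fit(passages + [question])
sims = cosine_similarity(vec.transform([question]), vec.transform(passages))[0]
top = int(sims.argmax())

prompt = (f"Context: {passages[top]}\n"
          f"Question: {question}\n"
          "Answer in line with medical consensus:")
print(prompt)  # this assembled prompt would then be sent to the Japanese LLM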
The chi-square and rank sum tests were used to explore whether internet use and access were associated with health inequality status. After controlling for confounders, multiple linear regression models were used to further determine this association. Sensitivity analysis was conducted using propensity score matching, and heterogeneity was analyzed for different influencing factors. Results: The 2018 analysis highlighted widening health disparities among older adults due to internet access and use, with statistically significant increases in inequalities in self-rated health (3.9\%), ADL score (5.8\%), and cognition (7.5\%). Similarly, internet use widened gaps in self-rated health (7.5\%) and cognition (7.6\%). Conversely, the 2020 analysis demonstrated that internet access narrowed health disparities among older adults, reducing gaps in self-rated health (3.8\%), ADL score (2.1\%), instrumental ADL score (3.5\%), and cognition (7.5\%), with significant results, except for ADL. Internet use also narrowed disparities, with significant effects on self-rated health (4.8\%) and cognition (12.8\%). The robustness of the results was confirmed through propensity score--matching paired tests. In addition, the study found heterogeneity in the effects of internet access and use on health inequalities among older adults, depending on sex, age, education, and region. Conclusions: The impact of internet access and use on health inequalities among older adults showed different trends in 2018 and 2020. These findings underscore the importance of addressing the challenges and barriers to internet use among older adults, particularly during the early stages of digital adoption. It is recommended to promote equitable access to the health benefits of the internet through policy interventions, social support, and technological advancements. ", doi="10.2196/62645", url="https://www.jmir.org/2025/1/e62645" } @Article{info:doi/10.2196/59937, author="Kaur, Harleen and Tripathi, Stuti and Chalga, Singh Manjeet and Benara, K. Sudhir and Dhiman, Amit and Gupta, Shefali and Nair, Saritha and Menon, Geetha and Gulati, K. B. and Sharma, Sandeep and Sharma, Saurabh", title="Unified Mobile App for Streamlining Verbal Autopsy and Cause of Death Assignment in India: Design and Development Study", journal="JMIR Form Res", year="2025", month="Jan", day="10", volume="9", pages="e59937", keywords="verbal autopsy", keywords="cause of death", keywords="mortality", keywords="mHealth", keywords="public health", keywords="India", keywords="mobile health", abstract="Background: Verbal autopsy (VA) has been a crucial tool in ascertaining population-level cause of death (COD) estimates, specifically in countries where medical certification of COD is relatively limited. The World Health Organization has released an updated instrument (Verbal Autopsy Instrument 2022) that supports electronic data collection methods along with analytical software for assigning COD. This questionnaire encompasses the primary signs and symptoms associated with prevalent diseases across all age groups. Traditional methods have primarily involved paper-based questionnaires and physician-coded approaches for COD assignment, which are time-consuming and resource-intensive. Although computer-coded algorithms have advanced the COD assignment process, data collection in densely populated countries like India remains a logistical challenge.
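Referring back to the Wu et al digital-divide entry above, a minimal Python sketch of 1:1 nearest-neighbor propensity score matching of the kind used there as a sensitivity analysis; the simulated covariates and coefficients are assumptions for illustration, not the study's CHARLS variables.

# Greedy 1:1 propensity score matching sketch on simulated data.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
n = 1000
age = rng.normal(70, 6, n)                     # invented covariate
edu = rng.integers(0, 4, n)                    # invented education level 0-3
logit = 0.3 * edu - 0.1 * (age - 65) - 1       # invented treatment model
treated = rng.binomial(1, 1 / (1 + np.exp(-logit)))   # "internet use"

X = np.column_stack([age, edu])
ps = LogisticRegression().fit(X, treated).predict_proba(X)[:, 1]

# Match each treated unit to the closest unmatched control on the score.
t_idx = np.where(treated == 1)[0]
c_pool = list(np.where(treated == 0)[0])
pairs = []
for i in t_idx:
    j = min(c_pool, key=lambda k: abs(ps[k] - ps[i]))
    pairs.append((i, j))
    c_pool.remove(j)
print(f"matched {len(pairs)} treated/control pairs on the propensity score")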
Objective: This study aimed to develop an Android-based mobile app specifically tailored for streamlining VA data collection by leveraging the existing Indian public health workforce. The app has been designed to integrate real-time data collection by frontline health workers and seamless data transmission and digital reporting of COD by physicians. This process aimed to enhance the efficiency and accuracy of COD assignment through VA. Methods: The app was developed using Android Studio, the primary integrated development environment for developing Android apps using Java. The front-end interface was developed using XML, while SQLite and MySQL were employed to streamline complete data storage on the local and server databases, respectively. The communication between the app and the server was facilitated through a PHP application programming interface to synchronize data from the local to the server database. The complete prototype was specifically built to reduce manual intervention and automate VA data collection. Results: The app was developed to align with the current Indian public health system for district-level COD estimation. By leveraging this mobile app, the average duration required for VA data collection to ascertainment of COD, which typically ranges from 6 to 8 months, is expected to decrease by approximately 80\%, reducing it to about 1-2 months. Based on annual caseload projections, the smallest administrative public health units, health and wellness centers, are anticipated to handle 35-40 VA cases annually, while medical officers at primary health centers are projected to manage 150-200 physician-certified VAs each year. The app's data collection and transmission efficiency were further improved based on feedback from user and subject area experts. Conclusions: The development of a unified mobile app could streamline the VA process, enabling the generation of accurate national and subnational COD estimates. This mobile app can be further piloted and scaled to different regions to integrate the automated VA model into the existing public health system for generating comprehensive mortality statistics in India. ", doi="10.2196/59937", url="https://formative.jmir.org/2025/1/e59937" } @Article{info:doi/10.2196/57263, author="Kallout, Julien and Lamer, Antoine and Grosjean, Julien and Kerdelhu{\'e}, Ga{\'e}tan and Bouzill{\'e}, Guillaume and Clavier, Thomas and Popoff, Benjamin", title="Contribution of Open Access Databases to Intensive Care Medicine Research: Scoping Review", journal="J Med Internet Res", year="2025", month="Jan", day="9", volume="27", pages="e57263", keywords="intensive care unit", keywords="ICU", keywords="big data", keywords="databases", keywords="open access", keywords="Amsterdam University Medical Centers Database", keywords="AmsterdamUMCdb", keywords="eICU Collaborative Research Database", keywords="eICU-CRD", keywords="database", keywords="screening", keywords="descriptive analysis", abstract="Background: Intensive care units (ICUs) handle the most critical patients with a high risk of mortality. Due to those conditions, close monitoring is necessary and therefore, a large volume of data is collected. Collaborative ventures have enabled the emergence of large open access databases, leading to numerous publications in the field. Objective: The aim of this scoping review is to identify the characteristics of studies using open access intensive care databases and to describe the contribution of these studies to intensive care research.
Methods: The research was conducted using 3 databases (PubMed--MEDLINE, Embase, and Web of Science) from the inception of each database to August 1, 2022. We included original articles based on 4 open databases of patients admitted to ICUs: Amsterdam University Medical Centers Database, eICU Collaborative Research Database, High time resolution ICU dataset, and Medical Information Mart for Intensive Care (II to IV). A double-blinded screening for eligibility was performed, first on the title and abstract and subsequently on the full-text articles. Characteristics relating to publication journals, study design, and statistical analyses were extracted and analyzed. Results: We observed a consistent increase in the number of publications from these databases since 2016. The Medical Information Mart for Intensive Care databases were the most frequently used. The highest contributions came from China and the United States, with 689 (52.7\%) and 370 (28.3\%) publications, respectively. The median impact factor of publications was 3.8 (IQR 2.8-5.8). Topics related to cardiovascular and infectious diseases were predominant, accounting for 333 (25.5\%) and 324 (24.8\%) articles, respectively. Logistic regression emerged as the most commonly used statistical model for both inference and prediction questions, featuring in 396 (55.5\%) and 281 (47.5\%) studies, respectively. A majority of the inference studies yielded statistically significant results (84.0\%). In prediction studies, area under the curve was the most frequent performance measure, with a median value of 0.840 (IQR 0.780-0.890). Conclusions: The abundance of scientific outputs resulting from these databases, coupled with the diversity of topics addressed, highlights the importance of these databases as valuable resources for clinical research. This suggests their potential impact on clinical practice within intensive care settings. However, the quality and clinical relevance of these studies remain highly heterogeneous, with a majority of articles being published in low--impact factor journals. ", doi="10.2196/57263", url="https://www.jmir.org/2025/1/e57263" } @Article{info:doi/10.2196/58630, author="Malburg, M. Carly and Gutreuter, Steve and Ruise{\~n}or-Escudero, Horacio and Abdul-Quader, Abu and Hladik, Wolfgang", title="Population Size Estimation of Men Who Have Sex With Men in Low- and Middle-Income Countries: Google Trends Analysis", journal="JMIR Public Health Surveill", year="2025", month="Jan", day="9", volume="11", pages="e58630", keywords="population size estimation", keywords="men who have sex with men", keywords="MSM", keywords="PSE", keywords="google trends", keywords="HIV", keywords="AIDS", keywords="programming and policy", keywords="internet", keywords="porn", keywords="gay porn", keywords="male adult", keywords="geriatric", keywords="linear regression", keywords="homosexuality", keywords="sensitivity analysis", keywords="World Health Organization", keywords="WHO", keywords="epidemiology", abstract="Background: Population size estimation (PSE) for key populations is needed to inform HIV programming and policy. Objective: This study aimed to examine the utility of applying a recently proposed method using Google Trend (GT) internet search data to generate PSE (Google Trends Population Size Estimate [GTPSE]) for men who have sex with men (MSM) in 54 countries in Africa, Asia, the Americas, and Europe.
Methods: We examined GT relative search volumes (representing the relative internet search frequency of specific search terms) for ``porn'' and, as a comparator term, ``gay porn'' for the year 2020. We assumed ``porn'' represents ``men'' (denominator) while ``gay porn'' represents a subset of ``MSM'' (numerator) in each country, resulting in a proportional size estimate for MSM. We multiplied the proportional GTPSE values by the countries' male adult population (15-49 years) to obtain absolute size estimates. Separately, we produced subnational MSM PSE limited to countries' (commercial) capitals. Using linear regression analysis, we examined the effect of countries' levels of urbanization, internet penetration, criminalization of homosexuality, and stigma on national GTPSE results. We conducted a sensitivity analysis in a subset of countries (n=14) examining the effect of alternative English search terms, different language search terms (Spanish, French, and Swahili), and alternative search years (2019 and 2021). Results: One country was excluded from our analysis as no GT data could be obtained. Of the remaining 53 countries, all national GTPSE values exceeded the World Health Organization's recommended minimum PSE threshold of 1\% (range 1.2\%-7.5\%). For 44 out of 49 (89.8\%) countries, GTPSE results were higher than Joint United Nations Programme on HIV/AIDS (UNAIDS) Key Population Atlas values but largely consistent with the regional UNAIDS Global AIDS Monitoring results. Substantial heterogeneity across same-region countries was evident in GTPSE, although smaller than that based on Key Population Atlas data. Subnational GTPSE values were obtained in 51 out of 53 (96\%) countries; all subnational GTPSE values exceeded 1\% but often did not match or exceed the corresponding countries' national estimates. None of the covariates examined had a substantial effect on the GTPSE values (R2 values 0.01-0.28). Alternative (English) search terms in 12 out of 14 (85\%) countries produced GTPSE>1\%. Using non-English language terms often produced markedly lower same-country GTPSE values compared with English, with 10 out of 14 (71\%) countries showing national GTPSE exceeding 1\%. GTPSE using search data from 2019 and 2021 yielded results similar to those of the reference year 2020. Due to a lack of absolute search volume data, credibility intervals could not be computed. The validity of key assumptions, especially who (males and females) searches for porn and gay porn, could not be assessed. Conclusions: GTPSE for MSM provides a simple, fast, essentially cost-free method. Limitations that impact the certainty of our estimates include a lack of validation of key assumptions and an inability to assign credibility intervals. GTPSE for MSM may provide an additional data source, especially for estimating national-level PSE.
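A worked toy example of the GTPSE calculation described above: the ratio of the ``gay porn'' to ``porn'' relative search volumes is taken as the proportional MSM estimate and scaled by the adult male population. All numbers below are invented for illustration.

# GTPSE arithmetic sketch with invented inputs.
rsv_gay_porn = 3.2            # hypothetical relative search volume, "gay porn"
rsv_porn = 80.0               # hypothetical relative search volume, "porn"
male_pop_15_49 = 12_500_000   # hypothetical national male population, 15-49 years

proportion_msm = rsv_gay_porn / rsv_porn          # 0.04, ie, 4.0%
absolute_pse = proportion_msm * male_pop_15_49    # 500,000 men
print(f"proportional PSE: {proportion_msm:.1%}; absolute PSE: {absolute_pse:,.0f}")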
", doi="10.2196/58630", url="https://publichealth.jmir.org/2025/1/e58630" } @Article{info:doi/10.2196/67272, author="Lee, Heui Yoon and Choi, Hanna and Lee, Soo-Kyoung", title="Development of Personas and Journey Maps for Artificial Intelligence Agents Supporting the Use of Health Big Data: Human-Centered Design Approach", journal="JMIR Form Res", year="2025", month="Jan", day="8", volume="9", pages="e67272", keywords="analysis", keywords="health big data", keywords="human-centered design", keywords="persona", keywords="user journey map", keywords="artificial intelligence", keywords="human-AI", keywords="interviews", keywords="users' experiences", abstract="Background: The rapid proliferation of artificial intelligence (AI) requires new approaches for human-AI interfaces that are different from classic human-computer interfaces. In developing a system that is conducive to the analysis and use of health big data (HBD), reflecting the empirical characteristics of users who have performed HBD analysis is the most crucial aspect to consider. Recently, human-centered design methodology, a field of user-centered design, has been expanded and is used not only to develop types of products but also technologies and services. Objective: This study was conducted to integrate and analyze users' experiences along the HBD analysis journey using the human-centered design methodology and reflect them in the development of AI agents that support future HBD analysis. This research aims to help accelerate the development of novel human-AI interfaces for AI agents that support the analysis and use of HBD, which will be urgently needed in the near future. Methods: Using human-centered design methodology, we collected data through shadowing and in-depth interviews with 16 people with experience in analyzing and using HBD. We identified users' empirical characteristics, emotions, pain points, and needs related to HBD analysis and use and created personas and journey maps. Results: The general characteristics of participants (n=16) were as follows: the majority were in their 40s (n=6, 38\%) and held a PhD degree (n=10, 63\%). Professors (n=7, 44\%) and health care personnel (n=10, 63\%) represented the largest professional groups. Participants' experiences with big data analysis varied, with 25\% (n=4) being beginners and 38\% (n=6) having extensive experience. Common analysis methods included statistical analysis (n=7, 44\%) and data mining (n=6, 38\%). Qualitative findings from shadowing and in-depth interviews revealed key challenges: lack of knowledge on using analytical solutions, crisis management difficulties during errors, and inadequate understanding of health care data and clinical decision-making, especially among non--health care professionals. Three types of personas and journey maps---health care professionals as big data analysis beginners, health care professionals who have experience in big data analytics, and non--health care professionals who are experts in big data analytics---were derived. They showed a need for personalized platforms tailored to the user level, appropriate direction through a navigation function, a crisis management support system, communication and sharing among users, and expert linkage service. Conclusions: The knowledge obtained from this study can be leveraged in designing an AI agent to support future HBD analysis and use. This is expected to further increase the usability of HBD by helping users perform effective use of HBD more easily. 
", doi="10.2196/67272", url="https://formative.jmir.org/2025/1/e67272" } @Article{info:doi/10.2196/49567, author="Luo, Waylon and Jin, Ruoming and Kenne, Deric and Phan, NhatHai and Tang, Tang", title="An Analysis of the Prevalence and Trends in Drug-Related Lyrics on Twitter (X): Quantitative Approach", journal="JMIR Form Res", year="2024", month="Dec", day="30", volume="8", pages="e49567", keywords="Twitter (X)", keywords="popular music", keywords="big data analysis", keywords="music", keywords="lyrics", keywords="big data", keywords="substance abuse", keywords="tweet", keywords="social media", keywords="drug", keywords="alcohol", abstract="Background: The pervasiveness of drug culture has become evident in popular music and social media. Previous research has examined drug abuse content in both social media and popular music; however, to our knowledge, the intersection of drug abuse content in these 2 domains has not been explored. To address the ongoing drug epidemic, we analyzed drug-related content on Twitter (subsequently rebranded X), with a specific focus on lyrics. Our study provides a novel finding on the prevalence of drug abuse by defining a new subcategory of X content: ``tweets that reference established drug lyrics.'' Objective: We aim to investigate drug trends in popular music on X, identify and classify popular drugs, and analyze related artists' gender, genre, and popularity. Based on the collected data, our goal is to create a prediction model for future drug trends and gain a deeper understanding of the characteristics of users who cite drug lyrics on X. Methods: X data were collected from 2015 to 2017 through the X streaming application programming interface (API). Drug lyrics were obtained from the Genius lyrics database using the Genius API based on drug keywords. The Smith-Waterman text-matching algorithm was used to detect drug lyrics in posts. We identified the drugs most frequently cited in posted lyrics and extended the analysis to related artists, songs, genres, and popularity on X. The frequency of drug-related lyrics on X was aggregated into a time-series, which was then used to create prediction models using linear regression, Facebook Prophet, and NIXTLA TimeGPT-1. In addition, we analyzed the number of followers of users posting drug-related lyrics to explore user characteristics. Results: We analyzed over 1.97 billion publicly available posts from 2015 to 2017, identifying more than 157 million that matched drug-related keywords. Of these, 150,746 posts referenced drug-related lyrics. Cannabinoids, opioids, stimulants, and hallucinogens were the most cited drugs in lyrics on X. Rap and hip-hop dominated, with 91.98\% of drug-related lyrics from these genres and 84.21\% performed by male artists. Predictions from all 3 models (linear regression, Facebook Prophet, and NIXTLA TimeGPT-1) indicate a slight decline in the prevalence of drug-related lyrics on X over time. Conclusions: Our study revealed 2 significant findings. First, we identified a previously unexamined subset of drug-related content on X: drug lyrics, which could play a critical role in models predicting the surge in drug-related incidents. Second, we demonstrated the use of cutting-edge time-series forecasting tools, including Facebook Prophet and NIXTLA TimeGPT-1, in accurately predicting these trends. These insights contribute to our understanding of how social media shapes public behavior and sentiment toward drug use.
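A minimal character-level Smith-Waterman local alignment, the text-matching technique the Luo et al study above used to detect lyric fragments inside posts; the scoring parameters, example post, and lyric are invented, and the normalization by self-alignment is an illustrative convention rather than the paper's exact procedure.

# Smith-Waterman local alignment score (match/mismatch/gap values invented).
def smith_waterman(a, b, match=2, mismatch=-1, gap=-1):
    rows, cols = len(a) + 1, len(b) + 1
    H = [[0] * cols for _ in range(rows)]
    best = 0
    for i in range(1, rows):
        for j in range(1, cols):
            diag = H[i - 1][j - 1] + (match if a[i - 1] == b[j - 1] else mismatch)
            H[i][j] = max(0, diag, H[i - 1][j] + gap, H[i][j - 1] + gap)
            best = max(best, H[i][j])
    return best

post = "sippin on some sizzurp all night"     # invented post text
lyric = "sippin on some sizzurp"              # invented lyric line
score = smith_waterman(post.lower(), lyric.lower())
# Normalize by the lyric's self-alignment score to get a 0-1 match ratio.
ratio = score / smith_waterman(lyric.lower(), lyric.lower())
print(f"alignment score {score}, match ratio {ratio:.2f}")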
", doi="10.2196/49567", url="https://formative.jmir.org/2024/1/e49567" } @Article{info:doi/10.2196/60017, author="Knight, Jo and Chandrabalan, Vardhan Vishnu and Emsley, A. Hedley C.", title="Visualizing Patient Pathways and Identifying Data Repositories in a UK Neurosciences Center: Exploratory Study", journal="JMIR Med Inform", year="2024", month="Dec", day="24", volume="12", pages="e60017", keywords="health data", keywords="business process monitoring notation", keywords="neurology", keywords="process monitoring", keywords="patient pathway", keywords="clinical pathway", keywords="patient care", keywords="EHR", keywords="electronic health record", keywords="dataset", keywords="questionnaire", keywords="patient data", keywords="NHS", keywords="National Health Service", abstract="Background: Health and clinical activity data are a vital resource for research, improving patient care and service efficiency. Health care data are inherently complex, and their acquisition, storage, retrieval, and subsequent analysis require a thorough understanding of the clinical pathways underpinning such data. Better use of health care data could lead to improvements in patient care and service delivery. However, this depends on the identification of relevant datasets. Objective: We aimed to demonstrate the application of business process modeling notation (BPMN) to represent clinical pathways at a UK neurosciences center and map the clinical activity to corresponding data flows into electronic health records and other nonstandard data repositories. Methods: We used BPMN to map and visualize a patient journey and the subsequent movement and storage of patient data. After identifying several datasets that were being held outside of the standard applications, we collected information about these datasets using a questionnaire. Results: We identified 13 standard applications where neurology clinical activity was captured as part of the patient's electronic health record including applications and databases for managing referrals, outpatient activity, laboratory data, imaging data, and clinic letters. We also identified 22 distinct datasets not within standard applications that were created and managed within the neurosciences department, either by individuals or teams. These were being used to deliver direct patient care and included datasets for tracking patient blood results, recording home visits, and tracking triage status. Conclusions: Mapping patient data flows and repositories allowed us to identify areas wherein the current electronic health record does not fulfill the needs of day-to-day patient care. Data that are being stored outside of standard applications represent a potential duplication of effort and risk being overlooked. Future work should identify unmet data needs to inform correct data capture and centralization within appropriate data architectures. ", doi="10.2196/60017", url="https://medinform.jmir.org/2024/1/e60017" } @Article{info:doi/10.2196/59113, author="de Groot, Rowdy and van der Graaff, Frank and van der Doelen, Dani{\"e}l and Luijten, Michiel and De Meyer, Ronald and Alrouh, Hekmat and van Oers, Hedy and Tieskens, Jacintha and Zijlmans, Josjan and Bartels, Meike and Popma, Arne and de Keizer, Nicolette and Cornet, Ronald and Polderman, C.
Tinca J.", title="Implementing Findable, Accessible, Interoperable, Reusable (FAIR) Principles in Child and Adolescent Mental Health Research: Mixed Methods Approach", journal="JMIR Ment Health", year="2024", month="Dec", day="19", volume="11", pages="e59113", keywords="FAIR data", keywords="research data management", keywords="data interoperability", keywords="data standardization", keywords="OMOP CDM", keywords="implementation", keywords="health data", keywords="data quality", keywords="FAIR principles", abstract="Background: The FAIR (Findable, Accessible, Interoperable, Reusable) data principles are a guideline to improve the reusability of data. However, properly implementing these principles is challenging due to a wide range of barriers. Objectives: To further the field of FAIR data, this study aimed to systematically identify barriers regarding implementing the FAIR principles in the area of child and adolescent mental health research, define the most challenging barriers, and provide recommendations for these barriers. Methods: Three sources were used as input to identify barriers: (1) evaluation of the implementation process of the Observational Medical Outcomes Partnership Common Data Model by 3 data managers; (2) interviews with experts on mental health research, reusable health data, and data quality; and (3) a rapid literature review. All barriers were categorized according to type as described previously, the affected FAIR principle, a category to add detail about the origin of the barrier, and whether a barrier was mental health specific. The barriers were assessed and ranked on impact with the data managers using the Delphi method. Results: Thirteen barriers were identified by the data managers, 7 were identified by the experts, and 30 barriers were extracted from the literature. This resulted in 45 unique barriers. The characteristics that were most assigned to the barriers were, respectively, external type (n=32/45; eg, organizational policy preventing the use of required software), tooling category (n=19/45; ie, software and databases), all FAIR principles (n=15/45), and not mental health specific (n=43/45). Consensus on ranking the scores of the barriers was reached after 2 rounds of the Delphi method. The most important recommendations to overcome the barriers are adding a FAIR data steward to the research team, accessible step-by-step guides, and ensuring sustainable funding for the implementation and long-term use of FAIR data. Conclusions: By systematically listing these barriers and providing recommendations, we intend to enhance the awareness of researchers and grant providers that making data FAIR demands specific expertise, available tooling, and proper investments. 
", doi="10.2196/59113", url="https://mental.jmir.org/2024/1/e59113" } @Article{info:doi/10.2196/60231, author="Silvey, Scott and Liu, Jinze", title="Sample Size Requirements for Popular Classification Algorithms in Tabular Clinical Data: Empirical Study", journal="J Med Internet Res", year="2024", month="Dec", day="17", volume="26", pages="e60231", keywords="medical informatics", keywords="machine learning", keywords="sample size", keywords="research design", keywords="decision trees", keywords="classification algorithm", keywords="clinical research", keywords="learning-curve analysis", keywords="analysis", keywords="analyses", keywords="guidelines", keywords="ML", keywords="decision making", keywords="algorithm", keywords="curve analysis", keywords="dataset", abstract="Background: The performance of a classification algorithm eventually reaches a point of diminishing returns, where adding more samples does not improve the results. Thus, there is a need to determine an optimal sample size that maximizes performance while accounting for computational burden or budgetary concerns. Objective: This study aimed to determine optimal sample sizes and the relationships between sample size and dataset-level characteristics over a variety of binary classification algorithms. Methods: A total of 16 large open-source datasets were collected, each containing a binary clinical outcome. Furthermore, 4 machine learning algorithms were assessed: XGBoost (XGB), random forest (RF), logistic regression (LR), and neural networks (NNs). For each dataset, the cross-validated area under the curve (AUC) was calculated at increasing sample sizes, and learning curves were fit. Sample sizes needed to reach the observed full--dataset AUC minus 2 points (0.02) were calculated from the fitted learning curves and compared across the datasets and algorithms. Dataset--level characteristics---minority class proportion, full--dataset AUC, number of features, type of features, and degree of nonlinearity---were examined. Negative binomial regression models were used to quantify relationships between these characteristics and expected sample sizes within each algorithm. A total of 4 multivariable models were constructed, which selected the best-fitting combination of dataset--level characteristics. Results: Among the 16 datasets (full-dataset sample sizes ranging from 70,000 to 1,000,000), median sample sizes were 9960 (XGB), 3404 (RF), 696 (LR), and 12,298 (NN) to reach AUC stability. For all 4 algorithms, more balanced classes (multiplier: 0.93-0.96 for a 1\% increase in minority class proportion) were associated with decreased sample size. Other characteristics varied in importance across algorithms---in general, more features, weaker features, and more complex relationships between the predictors and the response increased expected sample sizes. In multivariable analysis, the top selected predictors were minority class proportion among all 4 algorithms assessed, full--dataset AUC (XGB, RF, and NN), and dataset nonlinearity (XGB, RF, and NN). For LR, the top predictors were minority class proportion, percentage of strong linear features, and number of features. Final multivariable sample size models had high goodness-of-fit, with dataset--level predictors explaining a majority (66.5\%-84.5\%) of the total deviance in the data among all 4 models.
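A Python sketch of the learning-curve extrapolation described in the Silvey and Liu Methods above: fit an inverse power law to cross-validated AUCs at increasing sample sizes, then solve for the n that reaches a target 0.02 below the plateau. The AUC points and the use of the fitted asymptote as a proxy for the full-dataset AUC are assumptions for illustration.

# Learning-curve fit and sample-size extrapolation on invented data.
import numpy as np
from scipy.optimize import curve_fit

def learning_curve(n, a, b, c):
    return a - b * n ** (-c)   # AUC approaches the asymptote a as n grows

sizes = np.array([100, 250, 500, 1000, 2500, 5000, 10000])
aucs = np.array([0.71, 0.75, 0.78, 0.80, 0.82, 0.83, 0.835])

(a, b, c), _ = curve_fit(learning_curve, sizes, aucs, p0=[0.85, 1.0, 0.5])
target = a - 0.02                        # proxy for full-dataset AUC minus 0.02
# Invert a - b * n^(-c) = target  ->  n = (b / (a - target))^(1/c)
n_needed = (b / (a - target)) ** (1 / c)
print(f"asymptote {a:.3f}; n to reach AUC {target:.3f}: {n_needed:,.0f}")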
Conclusions: The sample sizes needed to reach AUC stability among 4 popular classification algorithms vary by dataset and method and are associated with dataset--level characteristics that can be influenced or estimated before the start of a research study. ", doi="10.2196/60231", url="https://www.jmir.org/2024/1/e60231" } @Article{info:doi/10.2196/64362, author="Dahu, M. Butros and Khan, Solaiman and Toubal, Eddine Imad and Alshehri, Mariam and Martinez-Villar, I. Carlos and Ogundele, B. Olabode and Sheets, R. Lincoln and Scott, J. Grant", title="Geospatial Modeling of Deep Neural Visual Features for Predicting Obesity Prevalence in Missouri: Quantitative Study", journal="JMIR AI", year="2024", month="Dec", day="17", volume="3", pages="e64362", keywords="geospatial modeling", keywords="deep convolutional neural network", keywords="DCNN", keywords="Residual Network-50", keywords="ResNet-50", keywords="satellite imagery", keywords="Moran I", keywords="local indicators of spatial association", keywords="LISA", keywords="spatial lag model", keywords="obesity rate", keywords="artificial intelligence", keywords="AI", abstract="Background: The global obesity epidemic demands innovative approaches to understand its complex environmental and social determinants. Spatial technologies, such as geographic information systems, remote sensing, and spatial machine learning, offer new insights into this health issue. This study uses deep learning and spatial modeling to predict obesity rates for census tracts in Missouri. Objective: This study aims to develop a scalable method for predicting obesity prevalence using deep convolutional neural networks applied to satellite imagery and geospatial analysis, focusing on 1052 census tracts in Missouri. Methods: Our analysis followed 3 steps. First, Sentinel-2 satellite images were processed using the Residual Network-50 model to extract environmental features from 63,592 image chips (224{\texttimes}224 pixels). Second, these features were merged with obesity rate data from the Centers for Disease Control and Prevention for Missouri census tracts. Third, a spatial lag model was used to predict obesity rates and analyze the association between deep neural visual features and obesity prevalence. Spatial autocorrelation was used to identify clusters of obesity rates. Results: Substantial spatial clustering of obesity rates was found across Missouri, with a Moran I value of 0.68, indicating similar obesity rates among neighboring census tracts. The spatial lag model demonstrated strong predictive performance, with an R2 of 0.93 and a spatial pseudo R2 of 0.92, explaining 93\% of the variation in obesity rates. Local indicators from a spatial association analysis revealed regions with distinct high and low clusters of obesity, which were visualized through choropleth maps. Conclusions: This study highlights the effectiveness of integrating deep convolutional neural networks and spatial modeling to predict obesity prevalence based on environmental features from satellite imagery. The model's high accuracy and ability to capture spatial patterns offer valuable insights for public health interventions. Future work should expand the geographical scope and include socioeconomic data to further refine the model for broader applications in obesity research. 
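A toy computation of global Moran's I, the spatial autocorrelation statistic the Dahu et al study above reports (I = 0.68 across Missouri census tracts); the 4-tract adjacency grid and obesity rates here are invented.

# Global Moran's I on an invented 4-tract example.
import numpy as np

rates = np.array([0.35, 0.33, 0.22, 0.20])   # invented tract obesity rates
# Adjacency: tracts 0-1, 1-2, and 2-3 are neighbors; then row-standardize.
W = np.array([[0, 1, 0, 0],
              [1, 0, 1, 0],
              [0, 1, 0, 1],
              [0, 0, 1, 0]], dtype=float)
W /= W.sum(axis=1, keepdims=True)

z = rates - rates.mean()
n, s0 = len(rates), W.sum()
moran_i = (n / s0) * (z @ W @ z) / (z @ z)
print(f"Moran's I = {moran_i:.2f}")  # positive: similar neighboring values cluster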
", doi="10.2196/64362", url="https://ai.jmir.org/2024/1/e64362", url="http://www.ncbi.nlm.nih.gov/pubmed/39688897" } @Article{info:doi/10.2196/67928, author="Jung, Sun-Young and Lee, Ji-Hyeon", title="Emotional Touch Nursing Competencies Model of the Fourth Industrial Revolution: Instrument Validation Study", journal="Asian Pac Isl Nurs J", year="2024", month="Dec", day="16", volume="8", pages="e67928", keywords="nurse", keywords="therapeutic touch", keywords="clinical competence", keywords="factor analysis", keywords="statistical", keywords="reliability", keywords="scale", keywords="tool", keywords="nursing", keywords="industrial revolution", keywords="competencies", keywords="health care", keywords="emotional", keywords="interview", keywords="collaborative practice", keywords="learning agility", keywords="professional commitment", keywords="positive self-worth", keywords="compliance", keywords="ethics", keywords="practice ability", keywords="relationship ability", keywords="nursing sensitivity", abstract="Background: The Fourth Industrial Revolution is transforming the health care sector through advanced technologies such as artificial intelligence, the Internet of Things, and big data, leading to new expectations for rapid and accurate treatment. While the integration of technology in nursing tasks is on the rise, there remains a critical need to balance technological efficiency with empathy and emotional connection. This study aims to develop and validate a competency model for emotional touch nursing that responds to the evolving demands of the changing health care environment. Objective: The aims of our study are to develop an emotional touch nursing competencies model and to verify its reliability and validity. Methods: A conceptual framework and construct factors were developed based on an extensive literature review and in-depth interviews with nurses. The potential competencies were confirmed by 20 experts, and preliminary questions were prepared. The final version of the scale was verified through exploratory factor analysis (n=255) and confirmatory factor analysis (n=256) to assess its validity and reliability. Results: From the exploratory analysis, 8 factors and 38 items (client-centered collaborative practice, learning agility for nursing, nursing professional commitment, positive self-worth, compliance with ethics and roles, nursing practice competence, nurse-client relationship, and nursing sensitivity) were extracted. These items were verified through convergent and discriminant validity testing. The internal consistency reliability was acceptable (Cronbach $\alpha$=0.95). Conclusions: The findings from this study confirmed that this scale has sufficient validity and reliability to measure emotional touch nursing competencies. It is expected to be used to build a knowledge and educational system for emotional touch nursing. 
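A minimal Cronbach alpha computation, the internal-consistency statistic the Jung and Lee scale study above reports (alpha = .95 for the 38-item instrument); the 5-item, 5-respondent response matrix below is invented.

# Cronbach alpha on an invented respondents-by-items score matrix.
import numpy as np

scores = np.array([[4, 5, 4, 4, 5],
                   [3, 3, 4, 3, 3],
                   [5, 5, 5, 4, 5],
                   [2, 3, 2, 3, 2],
                   [4, 4, 5, 4, 4]], dtype=float)

k = scores.shape[1]                              # number of items
item_vars = scores.var(axis=0, ddof=1).sum()     # sum of item variances
total_var = scores.sum(axis=1).var(ddof=1)       # variance of total scores
alpha = (k / (k - 1)) * (1 - item_vars / total_var)
print(f"Cronbach alpha = {alpha:.2f}")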
", doi="10.2196/67928", url="https://apinj.jmir.org/2024/1/e67928" } @Article{info:doi/10.2196/63795, author="Shimoo, Satoshi and Senoo, Keitaro and Okawa, Taku and Kawai, Kohei and Makino, Masahiro and Munakata, Jun and Tomura, Nobunari and Iwakoshi, Hibiki and Nishimura, Tetsuro and Shiraishi, Hirokazu and Inoue, Keiji and Matoba, Satoaki", title="Using Machine Learning to Predict the Duration of Atrial Fibrillation: Model Development and Validation", journal="JMIR Med Inform", year="2024", month="Nov", day="22", volume="12", pages="e63795", keywords="persistent atrial fibrillation", keywords="atrial fibrillation duration", keywords="12-lead electrocardiogram", keywords="machine learning", keywords="support system", abstract="Background: Atrial fibrillation (AF) is a progressive disease, and its clinical type is classified according to the AF duration: paroxysmal AF, persistent AF (PeAF; AF duration of less than 1 year), and long-standing persistent AF (AF duration of more than 1 year). When considering the indication for catheter ablation, having a long AF duration is considered a risk factor for recurrence, and therefore, the duration of AF is an important factor in determining the treatment strategy for PeAF. Objective: This study aims to improve the accuracy of the cardiologists' diagnosis of the AF duration, and the steps to achieve this goal are to develop a predictive model of the AF duration and validate the support performance of the prediction model. Methods: The study included 272 patients with PeAF (aged 20-90 years), with data obtained between January 1, 2015, and December 31, 2023. Of those, 189 (69.5\%) were included in the study, excluding 83 (30.5\%) who met the exclusion criteria. Of the 189 patients included, 145 (76.7\%) were used as training data to build the machine learning (ML) model and 44 (23.3\%) were used as test data for predictive ability of the ML model. Using a questionnaire, 10 cardiologists (group A) evaluated whether the test data (44 patients) included AF of more than a 1-year duration (phase 1). Next, the same questionnaire was performed again after providing the ML model's answer (phase 2). Subsequently, another 10 cardiologists (group B) were shown the test results of group A, were made aware of the limitations of their own diagnostic abilities, and were then administered the same 2-stage test as group A. Results: The prediction results with the ML model using the test data provided 81.8\% accuracy (72\% sensitivity and 89\% specificity). The mean percentage of correct answers in group A was 63.9\% (SD 9.6\%) for phase 1 and improved to 71.6\% (SD 9.3\%) for phase 2 (P=.01). The mean percentage of correct answers in group B was 59.8\% (SD 5.3\%) for phase 1 and improved to 68.2\% (SD 5.9\%) for phase 2 (P=.007). The mean percentage of answers that differed from the ML model's prediction for phase 2 (percentage of answers where cardiologists did not trust the ML model and believed their own determination) was 17.3\% (SD 10.3\%) in group A and 20.9\% (SD 5\%) in group B and was not significantly different (P=.85). Conclusions: ML models predicting AF duration improved the diagnostic ability of cardiologists. However, cardiologists did not entirely rely on the ML model's prediction, even if they were aware of their diagnostic capability limitations. 
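A back-of-the-envelope check of the Shimoo et al test metrics above (81.8% accuracy, 72% sensitivity, 89% specificity on 44 test patients); the 18/26 positive/negative split and cell counts are assumptions chosen to be consistent with the reported totals, not figures from the paper.

# Confusion-matrix arithmetic with assumed cell counts.
tp, fn = 13, 5     # assumed: 18 patients with AF duration > 1 year
tn, fp = 23, 3     # assumed: 26 patients with AF duration <= 1 year

sensitivity = tp / (tp + fn)                 # 13/18 = 0.722
specificity = tn / (tn + fp)                 # 23/26 = 0.885
accuracy = (tp + tn) / (tp + fn + tn + fp)   # 36/44 = 0.818
print(f"sens {sensitivity:.3f}, spec {specificity:.3f}, acc {accuracy:.3f}")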
", doi="10.2196/63795", url="https://medinform.jmir.org/2024/1/e63795" } @Article{info:doi/10.2196/50235, author="Jefferson, Emily and Milligan, Gordon and Johnston, Jenny and Mumtaz, Shahzad and Cole, Christian and Best, Joseph and Giles, Charles Thomas and Cox, Samuel and Masood, Erum and Horban, Scott and Urwin, Esmond and Beggs, Jillian and Chuter, Antony and Reilly, Gerry and Morris, Andrew and Seymour, David and Hopkins, Susan and Sheikh, Aziz and Quinlan, Philip", title="The Challenges and Lessons Learned Building a New UK Infrastructure for Finding and Accessing Population-Wide COVID-19 Data for Research and Public Health Analysis: The CO-CONNECT Project", journal="J Med Internet Res", year="2024", month="Nov", day="20", volume="26", pages="e50235", keywords="COVID-19", keywords="infrastructure", keywords="trusted research environments", keywords="safe havens", keywords="feasibility analysis", keywords="cohort discovery", keywords="federated analytics", keywords="federated discovery", keywords="lessons learned", keywords="population wide", keywords="data", keywords="public health", keywords="analysis", keywords="CO-CONNECT", keywords="challenges", keywords="data transformation", doi="10.2196/50235", url="https://www.jmir.org/2024/1/e50235" } @Article{info:doi/10.2196/57754, author="Liu, Shuimei and Guo, Raymond L.", title="Data Ownership in the AI-Powered Integrative Health Care Landscape", journal="JMIR Med Inform", year="2024", month="Nov", day="19", volume="12", pages="e57754", keywords="data ownership", keywords="integrative healthcare", keywords="artificial intelligence", keywords="AI", keywords="ownership", keywords="data science", keywords="governance", keywords="consent", keywords="privacy", keywords="security", keywords="access", keywords="model", keywords="framework", keywords="transparency", doi="10.2196/57754", url="https://medinform.jmir.org/2024/1/e57754" } @Article{info:doi/10.2196/55148, author="Brehmer, Alexander and Sauer, Martin Christopher and Salazar Rodr{\'i}guez, Jayson and Herrmann, Kelsey and Kim, Moon and Keyl, Julius and Bahnsen, Hendrik Fin and Frank, Benedikt and K{\"o}hrmann, Martin and Rassaf, Tienush and Mahabadi, Amir-Abbas and Hadaschik, Boris and Darr, Christopher and Herrmann, Ken and Tan, Susanne and Buer, Jan and Brenner, Thorsten and Reinhardt, Christian Hans and Nensa, Felix and Gertz, Michael and Egger, Jan and Kleesiek, Jens", title="Establishing Medical Intelligence---Leveraging Fast Healthcare Interoperability Resources to Improve Clinical Management: Retrospective Cohort and Clinical Implementation Study", journal="J Med Internet Res", year="2024", month="Oct", day="31", volume="26", pages="e55148", keywords="clinical informatics", keywords="FHIR", keywords="real-world evidence", keywords="medical intelligence", keywords="interoperability", keywords="data exchange", keywords="clinical management", keywords="clinical decision-making", keywords="electronic health records", keywords="quality of care", keywords="quality improvement", abstract="Background: FHIR (Fast Healthcare Interoperability Resources) has been proposed to enable health data interoperability. So far, its applicability has been demonstrated for selected research projects with limited data. Objective: This study aimed to design and implement a conceptual medical intelligence framework to leverage real-world care data for clinical decision-making. 
Methods: A Python package for the use of multimodal FHIR data (FHIRPACK [FHIR Python Analysis Conversion Kit]) was developed and pioneered in 5 real-world clinical use cases, that is, myocardial infarction, stroke, diabetes, sepsis, and prostate cancer. Patients were identified based on the ICD-10 (International Classification of Diseases, Tenth Revision) codes, and outcomes were derived from laboratory tests, prescriptions, procedures, and diagnostic reports. Results were provided as browser-based dashboards. Results: For 2022, a total of 1,302,988 patient encounters were analyzed. (1) Myocardial infarction: in 72.7\% (261/359) of cases, medication regimens fulfilled guideline recommendations. (2) Stroke: out of 1277 patients, 165 received thrombolysis and 108 thrombectomy. (3) Diabetes: in 443,866 serum glucose and 16,180 glycated hemoglobin A1c measurements from 35,494 unique patients, the prevalence of dysglycemic findings was 39\% (13,887/35,494). Among those with dysglycemia, diagnosis was coded in 44.2\% (6138/13,887) of the patients. (4) Sepsis: in 1803 patients, Staphylococcus epidermidis was the most frequently isolated pathogen (773/2672, 28.9\%), and piperacillin and tazobactam was the most frequently prescribed antibiotic (593/1593, 37.2\%). (5) Prostate cancer: out of 54 patients who received radical prostatectomy, 3 were identified as cases with prostate-specific antigen persistence or biochemical recurrence. Conclusions: Leveraging FHIR data through large-scale analytics can enhance health care quality and improve patient outcomes across 5 clinical specialties. We identified (1) patients with sepsis requiring less broad antibiotic therapy, (2) patients with myocardial infarction who could benefit from statin and antiplatelet therapy, (3) patients who had a stroke with longer than recommended times to intervention, (4) patients with hyperglycemia who could benefit from specialist referral, and (5) patients with prostate cancer with early increases in cancer markers. ", doi="10.2196/55148", url="https://www.jmir.org/2024/1/e55148", url="http://www.ncbi.nlm.nih.gov/pubmed/39240144" } @Article{info:doi/10.2196/53636, author="Bardhan, Jayetri and Roberts, Kirk and Wang, Zhe Daisy", title="Question Answering for Electronic Health Records: Scoping Review of Datasets and Models", journal="J Med Internet Res", year="2024", month="Oct", day="30", volume="26", pages="e53636", keywords="medical question answering", keywords="electronic health record", keywords="EHR", keywords="electronic medical records", keywords="EMR", keywords="relational database", keywords="knowledge graph", abstract="Background: Question answering (QA) systems for patient-related data can assist both clinicians and patients. They can, for example, assist clinicians in decision-making and enable patients to have a better understanding of their medical history. Substantial amounts of patient data are stored in electronic health records (EHRs), making EHR QA an important research area. Because of the differences in data format and modality, this differs greatly from other medical QA tasks that use medical websites or scientific papers to retrieve answers, making it critical to research EHR QA. Objective: This study aims to provide a methodological review of existing works on QA for EHRs.
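A generic Python sketch of pulling cohort data over the standard FHIR REST API, in the spirit of the FHIRPACK-based workflow in the Brehmer et al entry above; FHIRPACK's own API is not shown here, the base URL is a public HAPI FHIR test server, and the ICD-10 code is illustrative.

# Query a FHIR server for Condition resources carrying an ICD-10 code.
import requests

BASE = "https://hapi.fhir.org/baseR4"   # public test server, not a hospital system

# Standard FHIR token search: code=<system>|<code> (I21: acute myocardial infarction).
resp = requests.get(f"{BASE}/Condition",
                    params={"code": "http://hl7.org/fhir/sid/icd-10|I21",
                            "_count": 20},
                    timeout=30)
bundle = resp.json()
patient_refs = [entry["resource"]["subject"]["reference"]
                for entry in bundle.get("entry", [])
                if "subject" in entry.get("resource", {})]
print(f"{len(patient_refs)} matching conditions; e.g., {patient_refs[:3]}")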
The objectives of this study were to identify the existing EHR QA datasets and analyze them, study the state-of-the-art methodologies used in this task, compare the different evaluation metrics used by these state-of-the-art models, and finally elicit the various challenges and the ongoing issues in EHR QA. Methods: We searched for articles from January 1, 2005, to September 30, 2023, in 4 digital sources, including Google Scholar, ACL Anthology, ACM Digital Library, and PubMed, to collect relevant publications on EHR QA. Our systematic screening process followed PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) guidelines. A total of 4111 papers were identified for our study, and after screening based on our inclusion criteria, we obtained 47 papers for further study. The selected studies were then classified into 2 non--mutually exclusive categories depending on their scope: ``EHR QA datasets'' and ``EHR QA models.'' Results: A systematic screening process obtained 47 papers on EHR QA for final review. Out of the 47 papers, 53\% (n=25) were about EHR QA datasets, and 79\% (n=37) were about EHR QA models. It was observed that QA on EHRs is relatively new and unexplored, and most of the works are fairly recent. In addition, it was observed that emrQA is by far the most popular EHR QA dataset, both in terms of citations and usage in other papers. We have classified the EHR QA datasets based on their modality, and we have inferred that Medical Information Mart for Intensive Care (MIMIC-III) and the National Natural Language Processing Clinical Challenges datasets (ie, n2c2 datasets) are the most popular EHR databases and corpora used in EHR QA. Furthermore, we identified the different models used in EHR QA along with the evaluation metrics used for these models. Conclusions: EHR QA research faces multiple challenges, such as the limited availability of clinical annotations, concept normalization in EHR QA, and challenges faced in generating realistic EHR QA datasets. There are still many gaps in research that motivate further work. This study will assist future researchers in focusing on areas of EHR QA that have possible future research directions. ", doi="10.2196/53636", url="https://www.jmir.org/2024/1/e53636" } @Article{info:doi/10.2196/54246, author="Paiva, Bruno and Gon{\c{c}}alves, Andr{\'e} Marcos and da Rocha, Dutra Leonardo Chaves and Marcolino, Soriano Milena and Lana, Barbosa Fernanda Cristina and Souza-Silva, Rego Maira Viana and Almeida, M.
Jussara and Pereira, Delfino Polianna and de Andrade, Valiense Claudio Mois{\'e}s and Gomes, Reis Ang{\'e}lica Gomides dos and Ferreira, Pires Maria Ang{\'e}lica and Bartolazzi, Frederico and Sacioto, Furtado Manuela and Boscato, Paula Ana and Guimar{\~a}es-J{\'u}nior, Henriques Milton and dos Reis, Pereira Priscilla and Costa, Roberto Fel{\'i}cio and Jorge, Oliveira Alzira de and Coelho, Reis Laryssa and Carneiro, Marcelo and Sales, Souza Tha{\'i}s Lorenna and Ara{\'u}jo, Ferreira Silvia and Silveira, Vit{\'o}rio Daniel and Ruschel, Brasil Karen and Santos, Veloso Fernanda Caldeira and Cenci, Almeida Evelin Paola de and Menezes, Monteiro Luanna Silva and Anschau, Fernando and Bicalho, Camargos Maria Aparecida and Manenti, Fernandes Euler Roberto and Finger, Goulart Renan and Ponce, Daniela and de Aguiar, Carrilho Filipe and Marques, Margoto Luiza and de Castro, C{\'e}sar Lu{\'i}s and Vietta, Gr{\"u}newald Giovanna and Godoy, de Mariana Frizzo and Vila{\c{c}}a, Nascimento Mariana do and Morais, Costa Vivian", title="A New Natural Language Processing--Inspired Methodology (Detection, Initial Characterization, and Semantic Characterization) to Investigate Temporal Shifts (Drifts) in Health Care Data: Quantitative Study", journal="JMIR Med Inform", year="2024", month="Oct", day="28", volume="12", pages="e54246", keywords="health care", keywords="machine learning", keywords="data drifts", keywords="temporal drifts", abstract="Background: Proper analysis and interpretation of health care data can significantly improve patient outcomes by enhancing services and revealing the impacts of new technologies and treatments. Understanding the substantial impact of temporal shifts in these data is crucial. For example, COVID-19 vaccination initially lowered the mean age of at-risk patients and later changed the characteristics of those who died. This highlights the importance of understanding these shifts for assessing factors that affect patient outcomes. Objective: This study aims to propose detection, initial characterization, and semantic characterization (DIS), a new methodology for analyzing changes in health outcomes and variables over time while discovering contextual changes for outcomes in large volumes of data. Methods: The DIS methodology involves 3 steps: detection, initial characterization, and semantic characterization. Detection uses metrics such as Jensen-Shannon divergence to identify significant data drifts. Initial characterization offers a global analysis of changes in data distribution and predictive feature significance over time. Semantic characterization uses natural language processing--inspired techniques to understand the local context of these changes, helping identify factors driving changes in patient outcomes. By integrating the outcomes from these 3 steps, our results can identify specific factors (eg, interventions and modifications in health care practices) that drive changes in patient outcomes. DIS was applied to the Brazilian COVID-19 Registry and the Medical Information Mart for Intensive Care, version IV (MIMIC-IV) data sets. Results: Our approach allowed us to (1) identify drifts effectively, especially using metrics such as the Jensen-Shannon divergence, and (2) uncover reasons for the decline in overall mortality in both the COVID-19 and MIMIC-IV data sets, as well as changes in the cooccurrence between different diseases and this particular outcome.
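A minimal Python sketch of the detection step of the DIS methodology just described: compare a variable's distribution across two time windows with the Jensen-Shannon divergence and flag a drift above a threshold. The histograms and the 0.01 threshold are invented for illustration.

# Jensen-Shannon drift check on two invented binned distributions.
import numpy as np
from scipy.spatial.distance import jensenshannon

p = np.array([0.05, 0.15, 0.30, 0.30, 0.20])   # hypothetical earlier period
q = np.array([0.02, 0.08, 0.20, 0.35, 0.35])   # hypothetical later period

# scipy returns the JS distance; squaring gives the divergence.
jsd = jensenshannon(p, q, base=2) ** 2
print(f"JS divergence = {jsd:.4f}; drift flagged: {jsd > 0.01}")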
Factors such as vaccination during the COVID-19 pandemic and reduced iatrogenic events and cancer-related deaths in MIMIC-IV were highlighted. The methodology also pinpointed shifts in patient demographics and disease patterns, providing insights into the evolving health care landscape during the study period. Conclusions: We developed a novel methodology combining machine learning and natural language processing techniques to detect, characterize, and understand temporal shifts in health care data. This understanding can enhance predictive algorithms, improve patient outcomes, and optimize health care resource allocation, ultimately improving the effectiveness of machine learning predictive algorithms applied to health care data. Our methodology can be applied to a variety of scenarios beyond those discussed in this paper. ", doi="10.2196/54246", url="https://medinform.jmir.org/2024/1/e54246" } @Article{info:doi/10.2196/60293, author="Hu, Zhengyong and Wang, Anran and Duan, Yifan and Zhou, Jiayin and Hu, Wanfei and Wu, Sizhu", title="Toward Better Semantic Interoperability of Data Element Repositories in Medicine: Analysis Study", journal="JMIR Med Inform", year="2024", month="Sep", day="30", volume="12", pages="e60293", keywords="data element repository", keywords="FAIR", keywords="ISO/IEC 11179", keywords="metadata", keywords="semantic interoperability", abstract="Background: Data element repositories facilitate high-quality medical data sharing by standardizing data and enhancing semantic interoperability. However, the application of repositories is confined to specific projects and institutions. Objective: This study aims to explore potential issues and promote broader application of data element repositories within the medical field by evaluating and analyzing typical repositories. Methods: Following the inclusion of 5 data element repositories through a literature review, a novel analysis framework consisting of 7 dimensions and 36 secondary indicators was constructed and used for evaluation and analysis. Results: The study's results delineate the unique characteristics of different repositories and uncover specific issues in their construction. These issues include the absence of data reuse protocols and insufficient information regarding the application scenarios and efficacy of data elements. The repositories fully comply with only 45\% (9/20) of the subprinciples for Findable and Reusable in the FAIR principle, while achieving a 90\% (19/20 subprinciples) compliance rate for Accessible and 67\% (10/15 subprinciples) for Interoperable. Conclusions: The recommendations proposed in this study address the issues to improve the construction and application of repositories, offering valuable insights to data managers, computer experts, and other pertinent stakeholders.
", doi="10.2196/60293", url="https://medinform.jmir.org/2024/1/e60293", url="http://www.ncbi.nlm.nih.gov/pubmed/39348178" } @Article{info:doi/10.2196/59505, author="AlSaad, Rawan and Abd-alrazaq, Alaa and Boughorbel, Sabri and Ahmed, Arfan and Renault, Max-Antoine and Damseh, Rafat and Sheikh, Javaid", title="Multimodal Large Language Models in Health Care: Applications, Challenges, and Future Outlook", journal="J Med Internet Res", year="2024", month="Sep", day="25", volume="26", pages="e59505", keywords="artificial intelligence", keywords="large language models", keywords="multimodal large language models", keywords="multimodality", keywords="multimodal generative artificial intelligence", keywords="multimodal generative AI", keywords="generative artificial intelligence", keywords="generative AI", keywords="health care", doi="10.2196/59505", url="https://www.jmir.org/2024/1/e59505" } @Article{info:doi/10.2196/59392, author="Brahma, Arindam and Chatterjee, Samir and Seal, Kala and Fitzpatrick, Ben and Tao, Youyou", title="Development of a Cohort Analytics Tool for Monitoring Progression Patterns in Cardiovascular Diseases: Advanced Stochastic Modeling Approach", journal="JMIR Med Inform", year="2024", month="Sep", day="24", volume="12", pages="e59392", keywords="healthcare analytics", keywords="eHealth", keywords="disease monitoring", keywords="cardiovascular disease", keywords="disease progression model", keywords="myocardial", keywords="stroke", keywords="decision support", keywords="continuous-time Markov chain model", keywords="stochastic model", keywords="stochastic", keywords="Markov", keywords="cardiology", keywords="cardiovascular", keywords="heart", keywords="monitoring", keywords="progression", abstract="Background: The World Health Organization (WHO) reported that cardiovascular diseases (CVDs) are the leading cause of death worldwide. CVDs are chronic, with complex progression patterns involving episodes of comorbidities and multimorbidities. When dealing with chronic diseases, physicians often adopt a ``watchful waiting'' strategy, and actions are postponed until information is available. Population-level transition probabilities and progression patterns can be revealed by applying time-variant stochastic modeling methods to longitudinal patient data from cohort studies. Inputs from CVD practitioners indicate that tools to generate and visualize cohort transition patterns have many impactful clinical applications. The resultant computational model can be embedded in digital decision support tools for clinicians. However, to date, no study has attempted to accomplish this for CVDs. Objective: This study aims to apply advanced stochastic modeling methods to uncover the transition probabilities and progression patterns from longitudinal episodic data of patient cohorts with CVD and thereafter use the computational model to build a digital clinical cohort analytics artifact demonstrating the actionability of such models. Methods: Our data were sourced from 9 epidemiological cohort studies by the National Heart Lung and Blood Institute and comprised chronological records of 1274 patients associated with 4839 CVD episodes across 16 years. We then used the continuous-time Markov chain method to develop our model, which offers a robust approach to time-variant transitions between disease states in chronic diseases. Results: Our study presents time-variant transition probabilities of CVD state changes, revealing patterns of CVD progression against time. 
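The continuous-time Markov chain machinery described in the cohort analytics study above can be illustrated compactly: given a transition rate matrix Q, the matrix of transition probabilities after t days is P(t) = expm(Qt). The sketch below uses a toy 3-state example with hypothetical rates, not the study's 10-state model estimated from real episode data.

```python
# Toy continuous-time Markov chain: P(t) = expm(Q * t).
# The 3 states and the rate values are hypothetical; the study used a
# 10-state model fitted to longitudinal CVD episode data.
import numpy as np
from scipy.linalg import expm

states = ["angina", "MI", "stroke"]
# Off-diagonal entries are transition rates per day; each row sums to zero.
Q = np.array([
    [-0.0012,  0.0010,  0.0002],
    [ 0.0007, -0.0010,  0.0003],
    [ 0.0001,  0.0004, -0.0005],
])

for t in (30, 365):  # transition probabilities at 30 days and 1 year
    P = expm(Q * t)
    print(f"t = {t} days")
    for i, s in enumerate(states):
        row = ", ".join(f"{states[j]}: {P[i, j]:.3f}" for j in range(len(states)))
        print(f"  from {s}: {row}")
```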
We found that the transition from myocardial infarction (MI) to stroke has the fastest transition rate (mean transition time 3, SD 0 days, because only 1 patient had an MI-to-stroke transition in the dataset), and the transition from MI to angina is the slowest (mean transition time 1457, SD 1449 days). Congestive heart failure is the most probable first episode (371/840, 44.2\%), followed by stroke (216/840, 25.7\%). The resultant artifact is actionable as it can act as an eHealth cohort analytics tool, helping physicians gain insights into treatment and intervention strategies. Through expert panel interviews and surveys, we found 9 application use cases of our model. Conclusions: Past research does not provide actionable cohort-level decision support tools based on a comprehensive, 10-state, continuous-time Markov chain model to unveil complex CVD progression patterns from real-world patient data and support clinical decision-making. This paper aims to address this crucial limitation. Our stochastic model--embedded artifact can help clinicians in efficient disease monitoring and intervention decisions, guided by objective data-driven insights from real patient data. Furthermore, the proposed model can unveil progression patterns of any chronic disease of interest by inputting only 3 data elements: a synthetic patient identifier, episode name, and episode time in days from a baseline date. ", doi="10.2196/59392", url="https://medinform.jmir.org/2024/1/e59392", url="http://www.ncbi.nlm.nih.gov/pubmed/39316426" } @Article{info:doi/10.2196/52837, author="Xu, Lingyu and Li, Chenyu and Gao, Shuang and Zhao, Long and Guan, Chen and Shen, Xuefei and Zhu, Zhihui and Guo, Cheng and Zhang, Liwei and Yang, Chengyu and Bu, Quandong and Zhou, Bin and Xu, Yan", title="Personalized Prediction of Long-Term Renal Function Prognosis Following Nephrectomy Using Interpretable Machine Learning Algorithms: Case-Control Study", journal="JMIR Med Inform", year="2024", month="Sep", day="20", volume="12", pages="e52837", keywords="nephrectomy", keywords="acute kidney injury", keywords="acute kidney disease", keywords="chronic kidney disease", keywords="machine learning", abstract="Background: Acute kidney injury (AKI) is a common adverse outcome following nephrectomy. The progression from AKI to acute kidney disease (AKD) and subsequently to chronic kidney disease (CKD) remains a concern; yet, the predictive mechanisms for these transitions are not fully understood. Interpretable machine learning (ML) models offer insights into how clinical features influence long-term renal function outcomes after nephrectomy, providing a more precise framework for identifying patients at risk and supporting improved clinical decision-making processes. Objective: This study aimed to (1) evaluate postnephrectomy rates of AKI, AKD, and CKD, analyzing long-term renal outcomes along different trajectories; (2) interpret AKD and CKD models using Shapley Additive Explanations values and Local Interpretable Model-Agnostic Explanations algorithm; and (3) develop a web-based tool for estimating AKD or CKD risk after nephrectomy. Methods: We conducted a retrospective cohort study involving patients who underwent nephrectomy between July 2012 and June 2019. Patient data were randomly split into training, validation, and test sets, maintaining a ratio of 76.5:8.5:15. Eight ML algorithms were used to construct predictive models for postoperative AKD and CKD. The performance of the best-performing models was assessed using various metrics. 
We used various Shapley Additive Explanations plots and Local Interpretable Model-Agnostic Explanations bar plots to interpret the model and generated directed acyclic graphs to explore the potential causal relationships between features. Additionally, we developed a web-based prediction tool using the top 10 features for AKD prediction and the top 5 features for CKD prediction. Results: The study cohort comprised 1559 patients. Incidence rates for AKI, AKD, and CKD were 21.7\% (n=330), 15.3\% (n=238), and 10.6\% (n=165), respectively. Among the evaluated ML models, the Light Gradient-Boosting Machine (LightGBM) model demonstrated superior performance, with an area under the receiver operating characteristic curve of 0.97 for AKD prediction and 0.96 for CKD prediction. Performance metrics and plots highlighted the model's competence in discrimination, calibration, and clinical applicability. Operative duration, hemoglobin, blood loss, urine protein, and hematocrit were identified as the top 5 features associated with predicted AKD. Baseline estimated glomerular filtration rate, pathology, trajectories of renal function, age, and total bilirubin were the top 5 features associated with predicted CKD. Additionally, we developed a web application using the LightGBM model to estimate AKD and CKD risks. Conclusions: An interpretable ML model effectively elucidated its decision-making process in identifying patients at risk of AKD and CKD following nephrectomy by enumerating critical features. The web-based calculator, built on the LightGBM model, can assist in formulating more personalized and evidence-based clinical strategies. ", doi="10.2196/52837", url="https://medinform.jmir.org/2024/1/e52837" } @Article{info:doi/10.2196/59858, author="Yamashita, Kouhei and Nomoto, Yuji and Hirose, Tomoya and Yutani, Akira and Okada, Akira and Watanabe, Nayu and Suzuki, Ken and Senzaki, Munenori and Kuroda, Tomohiro", title="Early Diagnosis of Hereditary Angioedema in Japan Based on a US Medical Dataset: Algorithm Development and Validation", journal="JMIR Med Inform", year="2024", month="Sep", day="13", volume="12", pages="e59858", keywords="machine learning", keywords="screening", keywords="AI", keywords="prediction", keywords="rare diseases", keywords="HAE", keywords="electronic medical record", keywords="real world data", keywords="big data", keywords="angioedema", keywords="edema", keywords="ML", keywords="artificial intelligence", keywords="algorithm", keywords="algorithms", keywords="predictive model", keywords="predictive models", keywords="predictive analytics", keywords="predictive system", keywords="practical model", keywords="practical models", keywords="early warning", keywords="early detection", keywords="RWD", keywords="Electronic health record", keywords="EHR", keywords="electronic health records", keywords="EHRs", keywords="EMR", keywords="electronic medical records", keywords="EMRs", keywords="patient record", keywords="health record", keywords="health records", keywords="personal health record", keywords="PHR", abstract="Background: Hereditary angioedema (HAE), a rare genetic disease, induces acute attacks of swelling in various regions of the body. Its prevalence is estimated to be 1 in 50,000 people, with no reported bias among different ethnic groups. However, considering the estimated prevalence, the number of patients in Japan diagnosed with HAE remains approximately 1 in 250,000, which means that only 20\% of potential HAE cases are identified. 
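The interpretability workflow of the nephrectomy study above (a LightGBM classifier ranked by Shapley Additive Explanations values) can be sketched as follows on synthetic data; the feature names and outcome are placeholders for the study's cohort, not its actual data.

```python
# Sketch: train a LightGBM classifier and rank features by mean |SHAP| value.
# Synthetic data; feature names are placeholders for the study's cohort.
import numpy as np
import lightgbm as lgb
import shap

rng = np.random.default_rng(42)
features = ["operative_duration", "hemoglobin", "blood_loss",
            "urine_protein", "hematocrit"]
X = rng.normal(size=(1000, len(features)))
# Hypothetical outcome: risk driven mainly by the first two features.
y = (X[:, 0] + 0.5 * X[:, 1] + rng.normal(scale=0.5, size=1000) > 0.8).astype(int)

model = lgb.LGBMClassifier(n_estimators=200).fit(X, y)
shap_values = shap.TreeExplainer(model).shap_values(X)
# Older shap versions return a per-class list for binary classifiers.
vals = shap_values[1] if isinstance(shap_values, list) else shap_values
ranking = sorted(zip(features, np.abs(vals).mean(axis=0)), key=lambda kv: -kv[1])
for name, importance in ranking:
    print(f"{name}: mean |SHAP| = {importance:.3f}")
```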
Objective: This study aimed to develop an artificial intelligence (AI) model that can detect patients with suspected HAE using medical history data (medical claims, prescriptions, and electronic medical records [EMRs]) in the United States. We also aimed to validate the detection performance of the model for HAE cases using the Japanese dataset. Methods: The HAE patient and control groups were identified using the US claims and EMR datasets. We analyzed the characteristics of the diagnostic history of patients with HAE and developed an AI model to predict the probability of HAE based on a generalized linear model and bootstrap method. The model was then applied to the EMR data of the Kyoto University Hospital to verify its applicability to the Japanese dataset. Results: Precision and sensitivity were measured to validate the model performance. Using the comprehensive US dataset, the precision score was 2\% in the initial model development step. Our model can screen for suspected patients, among whom 1 in 50 has HAE. In addition, in the validation step with Japanese EMR data, the precision score was 23.6\%, which exceeded our expectations. We achieved a sensitivity score of 61.5\% for the US dataset and 37.6\% for the validation exercise using data from a single Japanese hospital. Overall, our model could predict patients with typical HAE symptoms. Conclusions: This study indicates that our AI model can detect HAE in patients with typical symptoms and is effective in Japanese data. However, further prospective clinical studies are required to investigate whether this model can be used to diagnose HAE. ", doi="10.2196/59858", url="https://medinform.jmir.org/2024/1/e59858" } @Article{info:doi/10.2196/49997, author="Wen, Andrew and Wang, Liwei and He, Huan and Fu, Sunyang and Liu, Sijia and Hanauer, A. David and Harris, R. Daniel and Kavuluru, Ramakanth and Zhang, Rui and Natarajan, Karthik and Pavinkurve, P. Nishanth and Hajagos, Janos and Rajupet, Sritha and Lingam, Veena and Saltz, Mary and Elowsky, Corey and Moffitt, A. Richard and Koraishy, M. Farrukh and Palchuk, B. Matvey and Donovan, Jordan and Lingrey, Lora and Stone-DerHagopian, Garo and Miller, T. Robert and Williams, E. Andrew and Leese, J. Peter and Kovach, I. Paul and Pfaff, R. Emily and Zemmel, Mikhail and Pates, D. Robert and Guthe, Nick and Haendel, A. Melissa and Chute, G. Christopher and Liu, Hongfang and ", title="A Case Demonstration of the Open Health Natural Language Processing Toolkit From the National COVID-19 Cohort Collaborative and the Researching COVID to Enhance Recovery Programs for a Natural Language Processing System for COVID-19 or Postacute Sequelae of SARS CoV-2 Infection: Algorithm Development and Validation", journal="JMIR Med Inform", year="2024", month="Sep", day="9", volume="12", pages="e49997", keywords="natural language processing", keywords="clinical information extraction", keywords="clinical phenotyping", keywords="extract", keywords="extraction", keywords="NLP", keywords="phenotype", keywords="phenotyping", keywords="narratives", keywords="unstructured", keywords="PASC", keywords="COVID", keywords="COVID-19", keywords="SARS-CoV-2", keywords="OHNLP", keywords="Open Health Natural Language Processing", abstract="Background: A wealth of clinically relevant information is only obtainable within unstructured clinical narratives, leading to great interest in clinical natural language processing (NLP). 
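The HAE detection model above pairs a generalized linear model with bootstrapping. A minimal sketch of that general pattern, with entirely synthetic claims-style features, might look like this; it illustrates the technique, not the study's code.

```python
# Sketch: logistic regression (a GLM) with bootstrap resampling to gauge
# the stability of a predicted probability. Data and features are synthetic.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

rng = np.random.default_rng(7)
n = 2000
# Hypothetical binary history features (e.g., abdominal pain, edema codes).
X = rng.integers(0, 2, size=(n, 4)).astype(float)
y = (X @ np.array([1.5, 1.0, 0.5, 0.2]) + rng.normal(size=n) > 2.0).astype(int)

new_patient = np.array([[1, 1, 0, 1]])
preds = []
for _ in range(200):  # 200 bootstrap replicates
    Xb, yb = resample(X, y)
    clf = LogisticRegression(max_iter=1000).fit(Xb, yb)
    preds.append(clf.predict_proba(new_patient)[0, 1])

print(f"predicted probability: {np.mean(preds):.3f} "
      f"(95% bootstrap interval {np.percentile(preds, 2.5):.3f}-"
      f"{np.percentile(preds, 97.5):.3f})")
```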
While a multitude of approaches to NLP exist, current algorithm development approaches have limitations that can slow the development process. These limitations are exacerbated when the task is emergent, as is the case currently for NLP extraction of signs and symptoms of COVID-19 and postacute sequelae of SARS-CoV-2 infection (PASC). Objective: This study aims to highlight the current limitations of existing NLP algorithm development approaches that are exacerbated by NLP tasks surrounding emergent clinical concepts and to illustrate our approach to addressing these issues through the use case of developing an NLP system for the signs and symptoms of COVID-19 and PASC. Methods: We used 2 preexisting studies on PASC as a baseline to determine a set of concepts that should be extracted by NLP. This concept list was then used in conjunction with the Unified Medical Language System to autonomously generate an expanded lexicon to weakly annotate a training set, which was then reviewed by a human expert to generate a fine-tuned NLP algorithm. The annotations from a fully human-annotated test set were then compared with NLP results from the fine-tuned algorithm. The NLP algorithm was then deployed to 10 additional sites that were also running our NLP infrastructure. Of these 10 sites, 5 were used to conduct a federated evaluation of the NLP algorithm. Results: An NLP algorithm consisting of 12,234 unique normalized text strings corresponding to 2366 unique concepts was developed to extract COVID-19 or PASC signs and symptoms. An unweighted mean dictionary coverage of 77.8\% was found for the 5 sites. Conclusions: The evolutionary and time-critical nature of the PASC NLP task significantly complicates existing approaches to NLP algorithm development. In this work, we present a hybrid approach using the Open Health Natural Language Processing Toolkit aimed at addressing these needs with a dictionary-based weak labeling step that minimizes the need for additional expert annotation while still preserving the fine-tuning capabilities of expert involvement. ", doi="10.2196/49997", url="https://medinform.jmir.org/2024/1/e49997" } @Article{info:doi/10.2196/59617, author="Heilmeyer, Felix and B{\"o}hringer, Daniel and Reinhard, Thomas and Arens, Sebastian and Lyssenko, Lisa and Haverkamp, Christian", title="Viability of Open Large Language Models for Clinical Documentation in German Health Care: Real-World Model Evaluation Study", journal="JMIR Med Inform", year="2024", month="Aug", day="28", volume="12", pages="e59617", keywords="machine learning", keywords="ML", keywords="artificial intelligence", keywords="AI", keywords="large language model", keywords="large language models", keywords="LLM", keywords="LLMs", keywords="natural language processing", keywords="NLP", keywords="deep learning", keywords="algorithm", keywords="algorithms", keywords="model", keywords="models", keywords="analytics", keywords="practical model", keywords="practical models", keywords="medical documentation", keywords="writing assistance", keywords="medical administration", keywords="writing assistance for physicians", abstract="Background: The use of large language models (LLMs) as writing assistance for medical professionals is a promising approach to reduce the time required for documentation, but there may be practical, ethical, and legal challenges in many jurisdictions complicating the use of the most powerful commercial LLM solutions. 
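The dictionary-based weak labeling step described above, matching an expanded lexicon against raw notes to produce provisional annotations for later expert review, can be sketched in a few lines. The lexicon entries, concept IDs, and note text below are invented examples, not the UMLS-derived lexicon of 12,234 normalized strings.

```python
# Sketch of dictionary-based weak labeling: scan clinical text for lexicon
# strings and emit provisional (weak) annotations. The lexicon and the
# concept mappings here are invented placeholders.
import re

lexicon = {
    "shortness of breath": "C-0001",  # hypothetical concept IDs
    "fatigue": "C-0002",
    "brain fog": "C-0003",
}

def weak_annotate(text: str):
    annotations = []
    for phrase, concept_id in lexicon.items():
        for match in re.finditer(re.escape(phrase), text.lower()):
            annotations.append((match.start(), match.end(), phrase, concept_id))
    return sorted(annotations)

note = "Patient reports persistent fatigue and brain fog since COVID-19 infection."
for start, end, phrase, cid in weak_annotate(note):
    print(f"[{start}:{end}] {phrase!r} -> {cid}")
```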
Objective: In this study, we assessed the feasibility of using nonproprietary LLMs of the GPT variety as writing assistance for medical professionals in an on-premise setting with restricted compute resources, generating German medical text. Methods: We trained four 7-billion--parameter models with 3 different architectures for our task and evaluated their performance using a powerful commercial LLM, namely Anthropic's Claude-v2, as a rater. Based on this, we selected the best-performing model and evaluated its practical usability with 2 independent human raters on real-world data. Results: In the automated evaluation with Claude-v2, BLOOM-CLP-German, a model trained from scratch on the German text, achieved the best results. In the manual evaluation by human experts, 95 (93.1\%) of the 102 reports generated by that model were evaluated as usable as is or with only minor changes by both human raters. Conclusions: The results show that even with restricted compute resources, it is possible to generate medical texts that are suitable for documentation in routine clinical practice. However, the target language should be considered in the model selection when processing non-English text. ", doi="10.2196/59617", url="https://medinform.jmir.org/2024/1/e59617" } @Article{info:doi/10.2196/56734, author="Sood, Dua Priyanka and Liu, Star and Lehmann, Harold and Kharrazi, Hadi", title="Assessing the Effect of Electronic Health Record Data Quality on Identifying Patients With Type 2 Diabetes: Cross-Sectional Study", journal="JMIR Med Inform", year="2024", month="Aug", day="27", volume="12", pages="e56734", keywords="electronic health record", keywords="EHR", keywords="EHRs", keywords="record", keywords="records", keywords="computable", keywords="phenotyping", keywords="phenotype", keywords="phenotypes", keywords="computable phenotypes", keywords="data quality", keywords="data science", keywords="chronic", keywords="identify", keywords="identification", keywords="data types---diagnosis data, medication data, laboratory data", keywords="type-2 diabetes", keywords="diabetes", keywords="diabetic", keywords="DM", keywords="type 2", keywords="hospital system", keywords="clinical research and trial", keywords="diagnosis", keywords="diagnoses", keywords="diagnose", keywords="diagnostic", keywords="diagnostics", keywords="phenotypic", abstract="Background: Increasing and substantial reliance on electronic health records (EHRs) and data types (ie, diagnosis, medication, and laboratory data) demands assessment of their data quality as a fundamental approach, especially since there is a need to identify appropriate denominator populations with chronic conditions, such as type 2 diabetes (T2D), using commonly available computable phenotype definitions (ie, phenotypes). Objective: To bridge this gap, our study aims to assess how issues of EHR data quality and variations and robustness (or lack thereof) in phenotypes may have potential impacts in identifying denominator populations. Methods: Approximately 208,000 patients with T2D were included in our study, which used retrospective EHR data from the Johns Hopkins Medical Institution (JHMI) during 2017-2019. Our assessment included 4 published phenotypes and 1 definition from a panel of experts at Hopkins. We conducted descriptive analyses of demographics (ie, age, sex, race, and ethnicity), use of health care (inpatient and emergency room visits), and the average Charlson Comorbidity Index score of each phenotype. 
We then used different methods to induce or simulate data quality issues of completeness, accuracy, and timeliness separately across each phenotype. For induced data incompleteness, our model randomly dropped diagnosis, medication, and laboratory codes independently at increments of 10\%; for induced data inaccuracy, our model randomly replaced a diagnosis or medication code with another code of the same data type and induced 2\% incremental change from -100\% to +10\% in laboratory result values; and lastly, for timeliness, data were modeled for induced incremental shift of date records by 30 days to 365 days. Results: Less than a quarter (n=47,326, 23\%) of the population overlapped across all phenotypes using EHRs. The population identified by each phenotype varied across all combinations of data types. Induced incompleteness identified fewer patients with each increment; for example, at 100\% diagnostic incompleteness, the Chronic Conditions Data Warehouse phenotype identified zero patients, as its phenotypic characteristics included only diagnosis codes. Induced inaccuracy and timeliness similarly demonstrated variations in performance of each phenotype, therefore resulting in fewer patients being identified with each incremental change. Conclusions: We used EHR data with diagnosis, medication, and laboratory data types from a large tertiary hospital system to understand T2D phenotypic differences and performance. We used induced data quality methods to learn how data quality issues may impact identification of the denominator populations upon which clinical (eg, clinical research and trials, population health evaluations) and financial or operational decisions are made. The novel results from our study may inform future approaches to shaping a common T2D computable phenotype definition that can be applied to clinical informatics, managing chronic conditions, and additional industry-wide efforts in health care. ", doi="10.2196/56734", url="https://medinform.jmir.org/2024/1/e56734" } @Article{info:doi/10.2196/51297, author="Gierend, Kerstin and Kr{\"u}ger, Frank and Genehr, Sascha and Hartmann, Francisca and Siegel, Fabian and Waltemath, Dagmar and Ganslandt, Thomas and Zeleke, Alamirrew Atinkut", title="Provenance Information for Biomedical Data and Workflows: Scoping Review", journal="J Med Internet Res", year="2024", month="Aug", day="23", volume="26", pages="e51297", keywords="provenance", keywords="biomedical research", keywords="data management", keywords="scoping review", keywords="health care data", keywords="software life cycle", abstract="Background: The record of the origin and the history of data, known as provenance, holds importance. Provenance information leads to higher interpretability of scientific results and enables reliable collaboration and data sharing. However, the lack of comprehensive evidence on provenance approaches hinders the uptake of good scientific practice in clinical research. Objective: This scoping review aims to identify approaches and criteria for provenance tracking in the biomedical domain. We reviewed the state-of-the-art frameworks, associated artifacts, and methodologies for provenance tracking. Methods: This scoping review followed the methodological framework developed by Arksey and O'Malley. We searched the PubMed and Web of Science databases for English-language articles published from 2006 to 2022. Title and abstract screening were carried out by 4 independent reviewers using the Rayyan screening tool. 
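The induced-incompleteness experiment in the T2D data quality study above (randomly dropping codes at 10% increments and re-running a phenotype) can be sketched as follows; the phenotype rule and code flags are simplified stand-ins for the published definitions.

```python
# Sketch: simulate data incompleteness by randomly dropping diagnosis codes
# at 10% increments and re-applying a (simplified) computable phenotype.
# The phenotype rule is a stand-in, not a published T2D definition.
import numpy as np

rng = np.random.default_rng(1)
n_patients = 10_000
# Hypothetical flags: does the record contain a diagnosis / medication code?
has_dx = rng.random(n_patients) < 0.25
has_rx = rng.random(n_patients) < 0.20

def phenotype(dx, rx):
    """Simplified phenotype: diagnosis code OR medication code present."""
    return dx | rx

for drop_rate in np.arange(0.0, 1.01, 0.1):
    kept = rng.random(n_patients) >= drop_rate  # randomly drop dx codes
    identified = phenotype(has_dx & kept, has_rx).sum()
    print(f"dx incompleteness {drop_rate:.0%}: {identified} patients identified")
```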
A majority vote was required for consensus on the eligibility of papers based on the defined inclusion and exclusion criteria. Full-text reading and screening were performed independently by 2 reviewers, and information was extracted into a pretested template for the 5 research questions. Disagreements were resolved by a domain expert. The study protocol has previously been published. Results: The search resulted in a total of 764 papers. Of 624 identified, deduplicated papers, 66 (10.6\%) studies fulfilled the inclusion criteria. We identified diverse provenance-tracking approaches ranging from practical provenance processing and managing to theoretical frameworks distinguishing diverse concepts and details of data and metadata models, provenance components, and notations. A substantial majority investigated underlying requirements to varying extents and validation intensities but lacked completeness in provenance coverage. Mostly, cited requirements concerned the knowledge about data integrity and reproducibility. Moreover, these revolved around robust data quality assessments, consistent policies for sensitive data protection, improved user interfaces, and automated ontology development. We found that different stakeholder groups benefit from the availability of provenance information. Thereby, we recognized that the term provenance is subjected to an evolutionary and technical process with multifaceted meanings and roles. Challenges included organizational and technical issues linked to data annotation, provenance modeling, and performance, amplified by subsequent matters such as enhanced provenance information and quality principles. Conclusions: As data volumes grow and computing power increases, the challenge of scaling provenance systems to handle data efficiently and assist complex queries intensifies, necessitating automated and scalable solutions. With rising legal and scientific demands, there is an urgent need for greater transparency in implementing provenance systems in research projects, despite the challenges of unresolved granularity and knowledge bottlenecks. We believe that our recommendations enable quality and guide the implementation of auditable and measurable provenance approaches as well as solutions in the daily tasks of biomedical scientists. International Registered Report Identifier (IRRID): RR2-10.2196/31750 ", doi="10.2196/51297", url="https://www.jmir.org/2024/1/e51297" } @Article{info:doi/10.2196/57615, author="Lighterness, Anthony and Adcock, Michael and Scanlon, Abigail Lauren and Price, Gareth", title="Data Quality--Driven Improvement in Health Care: Systematic Literature Review", journal="J Med Internet Res", year="2024", month="Aug", day="22", volume="26", pages="e57615", keywords="real-world data", keywords="data quality", keywords="quality improvement", keywords="systematic literature review", keywords="PRISMA", abstract="Background: The promise of real-world evidence and the learning health care system primarily depends on access to high-quality data. Despite widespread awareness of the prevalence and potential impacts of poor data quality (DQ), best practices for its assessment and improvement are unknown. Objective: This review aims to investigate how existing research studies define, assess, and improve the quality of structured real-world health care data. 
Methods: A systematic literature search of studies in the English language was implemented in the Embase and PubMed databases to select studies that specifically aimed to measure and improve the quality of structured real-world data within any clinical setting. The time frame for the analysis was from January 1945 to June 2023. We standardized DQ concepts according to the Data Management Association (DAMA) DQ framework to enable comparison between studies. After screening and filtering by 2 independent authors, we identified 39 relevant articles reporting DQ improvement initiatives. Results: The studies were characterized by considerable heterogeneity in settings and approaches to DQ assessment and improvement. Affiliated institutions were from 18 different countries and 18 different health domains. DQ assessment methods were largely manual and targeted completeness and 1 other DQ dimension. Use of DQ frameworks was limited to the Weiskopf and Weng (3/6, 50\%) or Kahn harmonized model (3/6, 50\%). Use of standardized methodologies to design and implement quality improvement was lacking, but mainly included plan-do-study-act (PDSA) or define-measure-analyze-improve-control (DMAIC) cycles. Most studies reported DQ improvements using multiple interventions, which included either DQ reporting and personalized feedback (24/39, 61\%), IT-related solutions (21/39, 54\%), training (17/39, 44\%), improvements in workflows (5/39, 13\%), or data cleaning (3/39, 8\%). Most studies reported improvements in DQ through a combination of these interventions. Statistical methods were used to determine significance of treatment effect (22/39, 56\%), but only 1 study implemented a randomized controlled study design. Variability in study designs, approaches to delivering interventions, and reporting DQ changes hindered a robust meta-analysis of treatment effects. Conclusions: There is an urgent need for standardized guidelines in DQ improvement research to enable comparison and effective synthesis of lessons learned. Frameworks such as PDSA learning cycles and the DAMA DQ framework can facilitate this unmet need. In addition, DQ improvement studies can also benefit from prioritizing root cause analysis of DQ issues to ensure the most appropriate intervention is implemented, thereby ensuring long-term, sustainable improvement. Despite the rise in DQ improvement studies in the last decade, significant heterogeneity in methodologies and reporting remains a challenge. Adopting standardized frameworks for DQ assessment, analysis, and improvement can enhance the effectiveness, comparability, and generalizability of DQ improvement initiatives. ", doi="10.2196/57615", url="https://www.jmir.org/2024/1/e57615" } @Article{info:doi/10.2196/48320, author="Swinckels, Laura and Bennis, C. Frank and Ziesemer, A. Kirsten and Scheerman, M. Janneke F. 
and Bijwaard, Harmen and de Keijzer, Ander and Bruers, Jan Josef", title="The Use of Deep Learning and Machine Learning on Longitudinal Electronic Health Records for the Early Detection and Prevention of Diseases: Scoping Review", journal="J Med Internet Res", year="2024", month="Aug", day="20", volume="26", pages="e48320", keywords="artificial intelligence", keywords="big data", keywords="detection", keywords="electronic health records", keywords="machine learning", keywords="personalized health care", keywords="prediction", keywords="prevention", abstract="Background: Electronic health records (EHRs) contain patients' health information over time, including possible early indicators of disease. However, the increasing amount of data hinders clinicians from using them. There is accumulating evidence suggesting that machine learning (ML) and deep learning (DL) can assist clinicians in analyzing these large-scale EHRs, as algorithms thrive on high volumes of data. Although ML has become well developed, studies mainly focus on engineering but lack medical outcomes. Objective: This study aims for a scoping review of the evidence on how the use of ML on longitudinal EHRs can support the early detection and prevention of disease. The medical insights and clinical benefits that have been generated were investigated by reviewing applications in a variety of diseases. Methods: This study was conducted according to the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) guidelines. A literature search was performed in 2022 in collaboration with a medical information specialist in the following databases: PubMed, Embase, Web of Science Core Collection (Clarivate Analytics), and IEEE Xplore Digital Library and computer science bibliography. Studies were eligible when longitudinal EHRs were used that aimed for the early detection of disease via ML in a prevention context. Studies with a technical focus or using imaging or hospital admission data were beyond the scope of this review. Study screening and selection and data extraction were performed independently by 2 researchers. Results: In total, 20 studies were included, mainly published between 2018 and 2022. They showed that a variety of diseases could be detected or predicted, particularly diabetes; kidney diseases; diseases of the circulatory system; and mental, behavioral, and neurodevelopmental disorders. Demographics, symptoms, procedures, laboratory test results, diagnoses, medications, and BMI were frequently used EHR data in basic recurrent neural network or long short-term memory techniques. By developing and comparing ML and DL models, medical insights such as a high diagnostic performance, an earlier detection, the most important predictors, and additional health indicators were obtained. A clinical benefit that has been evaluated positively was preliminary screening. If these models are applied in practice, patients might also benefit from personalized health care and prevention, with practical benefits such as workload reduction and policy insights. Conclusions: Longitudinal EHRs proved to be helpful for support in health care. Current ML models on EHRs can support the detection of diseases in terms of accuracy and offer preliminary screening benefits. Regarding the prevention of diseases, ML and specifically DL models can accurately predict or detect diseases earlier than current clinical diagnoses. Adding personally responsible factors allows targeted prevention interventions. 
While ML models based on textual EHRs are still in the developmental stage, they have high potential to support clinicians and the health care system and improve patient outcomes. ", doi="10.2196/48320", url="https://www.jmir.org/2024/1/e48320", url="http://www.ncbi.nlm.nih.gov/pubmed/39163096" } @Article{info:doi/10.2196/56673, author="Teodorowski, Piotr and Jones, Elisa and Tahir, Naheed and Ahmed, Saiqa and Rodgers, E. Sarah and Frith, Lucy", title="Public Involvement and Engagement in Big Data Research: Scoping Review", journal="J Particip Med", year="2024", month="Aug", day="16", volume="16", pages="e56673", keywords="patient and public involvement", keywords="PPI", keywords="involvement", keywords="engagement", keywords="big data", keywords="data science", keywords="patient engagement", keywords="co-design", keywords="coproduction", abstract="Background: The success of big data initiatives depends on public support. Public involvement and engagement could be a way of establishing public support for big data research. Objective: This review aims to synthesize the evidence on public involvement and engagement in big data research. Methods: This scoping review mapped the current evidence on public involvement and engagement activities in big data research. We searched 5 electronic databases, followed by additional manual searches of Google Scholar and gray literature. In total, 2 public contributors were involved at all stages of the review. Results: A total of 53 papers were included in the scoping review. The review showed the ways in which the public could be involved and engaged in big data research. The papers discussed a broad range of involvement activities, who could be involved or engaged, and the importance of the context in which public involvement and engagement occur. The findings show how public involvement, engagement, and consultation could be delivered in big data research. Furthermore, the review provides examples of potential outcomes that were produced by involving and engaging the public in big data research. Conclusions: This review provides an overview of the current evidence on public involvement and engagement in big data research. While the evidence is mostly derived from discussion papers, it is still valuable in illustrating how public involvement and engagement in big data research can be implemented and what outcomes they may yield. Further research and evaluation of public involvement and engagement in big data research are needed to better understand how to effectively involve and engage the public in big data research. 
International Registered Report Identifier (IRRID): RR2-https://doi.org/10.1136/bmjopen-2021-050167 ", doi="10.2196/56673", url="https://jopm.jmir.org/2024/1/e56673" } @Article{info:doi/10.2196/58548, author="Julian, Silva Guilherme and Shau, Wen-Yi and Chou, Hsu-Wen and Setia, Sajita", title="Bridging Real-World Data Gaps: Connecting Dots Across 10 Asian Countries", journal="JMIR Med Inform", year="2024", month="Aug", day="15", volume="12", pages="e58548", keywords="Asia", keywords="electronic medical records", keywords="EMR", keywords="health care databases", keywords="health technology assessment", keywords="HTA", keywords="real-world data", keywords="real-world evidence", doi="10.2196/58548", url="https://medinform.jmir.org/2024/1/e58548", url="http://www.ncbi.nlm.nih.gov/pubmed/39026427" } @Article{info:doi/10.2196/49542, author="Fruchart, Mathilde and Quindroit, Paul and Jacquemont, Chlo{\'e} and Beuscart, Jean-Baptiste and Calafiore, Matthieu and Lamer, Antoine", title="Transforming Primary Care Data Into the Observational Medical Outcomes Partnership Common Data Model: Development and Usability Study", journal="JMIR Med Inform", year="2024", month="Aug", day="13", volume="12", pages="e49542", keywords="data reuse", keywords="Observational Medical Outcomes Partnership", keywords="common data model", keywords="data warehouse", keywords="reproducible research", keywords="primary care", keywords="dashboard", keywords="electronic health record", keywords="patient tracking system", keywords="patient monitoring", keywords="EHR", keywords="primary care data", abstract="Background: Patient-monitoring software generates a large amount of data that can be reused for clinical audits and scientific research. The Observational Health Data Sciences and Informatics (OHDSI) consortium developed the Observational Medical Outcomes Partnership (OMOP) Common Data Model (CDM) to standardize electronic health record data and promote large-scale observational and longitudinal research. Objective: This study aimed to transform primary care data into the OMOP CDM format. Methods: We extracted primary care data from electronic health records at a multidisciplinary health center in Wattrelos, France. We performed structural mapping between the design of our local primary care database and the OMOP CDM tables and fields. Concepts from local French vocabularies were mapped to OHDSI standard vocabularies. To validate the implementation of primary care data into the OMOP CDM format, we applied a set of queries. A practical application was achieved through the development of a dashboard. Results: Data from 18,395 patients were implemented into the OMOP CDM, corresponding to 592,226 consultations over a period of 20 years. A total of 18 OMOP CDM tables were implemented. A total of 17 local vocabularies were identified as being related to primary care and corresponded to patient characteristics (sex, location, year of birth, and race), units of measurement, biometric measures, laboratory test results, medical histories, and drug prescriptions. During semantic mapping, 10,221 primary care concepts were mapped to standard OHDSI concepts. Five queries were used to validate the OMOP CDM by comparing the results obtained after the completion of the transformations with the results obtained in the source software. Lastly, a prototype dashboard was developed to visualize the activity of the health center, the laboratory test results, and the drug prescription data. 
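A minimal sketch of the kind of structural and semantic mapping step described in the OMOP study above, moving a source record into a CDM measurement row while looking up a standard concept ID, might look like this. The local codes, concept IDs, and field subset are illustrative; production ETLs resolve codes against the full OHDSI standardized vocabularies.

```python
# Sketch of an ETL step: map a source primary care record to an OMOP CDM
# MEASUREMENT-style row. The local-to-standard concept lookup is illustrative.
from dataclasses import dataclass
from datetime import date

# Hypothetical local lab code -> OMOP standard concept_id mapping.
LOCAL_TO_OMOP = {
    "GLY": 3004410,   # e.g., local code for HbA1c (illustrative concept ID)
    "CHOL": 3027114,  # e.g., local code for cholesterol (illustrative ID)
}

@dataclass
class MeasurementRow:
    person_id: int
    measurement_concept_id: int
    measurement_date: date
    value_as_number: float

def to_omop(person_id: int, local_code: str, when: date, value: float) -> MeasurementRow:
    concept_id = LOCAL_TO_OMOP.get(local_code, 0)  # 0 = "No matching concept"
    return MeasurementRow(person_id, concept_id, when, value)

print(to_omop(18395, "GLY", date(2023, 5, 4), 6.8))
```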
Conclusions: Primary care data from a French health care facility have been implemented into the OMOP CDM format. Data concerning demographics, units, measurements, and primary care consultation steps were already available in OHDSI vocabularies. Laboratory test results and drug prescription data were mapped to available vocabularies and structured in the final model. A dashboard application provided health care professionals with feedback on their practice. ", doi="10.2196/49542", url="https://medinform.jmir.org/2024/1/e49542" } @Article{info:doi/10.2196/53369, author="Metsallik, Janek and Draheim, Dirk and Sabic, Zlatan and Novak, Thomas and Ross, Peeter", title="Assessing Opportunities and Barriers to Improving the Secondary Use of Health Care Data at the National Level: Multicase Study in the Kingdom of Saudi Arabia and Estonia", journal="J Med Internet Res", year="2024", month="Aug", day="8", volume="26", pages="e53369", keywords="health data governance", keywords="secondary use", keywords="health information sharing maturity", keywords="large-scale interoperability", keywords="health data stewardship", keywords="health data custodianship", keywords="health information purpose", keywords="health data policy", abstract="Background: Digitization shall improve the secondary use of health care data. The Government of the Kingdom of Saudi Arabia ordered a project to compile the National Master Plan for Health Data Analytics, while the Government of Estonia ordered a project to compile the Person-Centered Integrated Hospital Master Plan. Objective: This study aims to map these 2 distinct projects' problems, approaches, and outcomes to find the matching elements for reuse in similar cases. Methods: We assessed both health care systems' abilities for secondary use of health data by exploratory case studies with purposive sampling and data collection via semistructured interviews and documentation review. The collected content was analyzed qualitatively and coded according to a predefined framework. The analytical framework consisted of data purpose, flow, and sharing. The Estonian project used the Health Information Sharing Maturity Model from the Mitre Corporation as an additional analytical framework. The data collection and analysis in the Kingdom of Saudi Arabia took place in 2019 and covered health care facilities, public health institutions, and health care policy. The project in Estonia collected its inputs in 2020 and covered health care facilities, patient engagement, public health institutions, health care financing, health care policy, and health technology innovations. Results: In both cases, the assessments resulted in a set of recommendations focusing on the governance of health care data. In the Kingdom of Saudi Arabia, the health care system consists of multiple isolated sectors, and there is a need for an overarching body coordinating data sets, indicators, and reports at the national level. The National Master Plan of Health Data Analytics proposed a set of organizational agreements for proper stewardship. Despite Estonia's national Digital Health Platform, the requirements remain uncoordinated between various data consumers. We recommended reconfiguring the stewardship of the national health data to include multipurpose data use into the scope of interoperability standardization. Conclusions: Proper data governance is the key to improving the secondary use of health data at the national level. 
The data flows from data providers to data consumers shall be coordinated by overarching stewardship structures and supported by interoperable data custodians. ", doi="10.2196/53369", url="https://www.jmir.org/2024/1/e53369" } @Article{info:doi/10.2196/55657, author="Yuan, Yingchao and Liu, Chang and Guo, Moning and Xin, Zhong and Chen, Guanjie and Yang, Yue and Zheng, Jianpeng and Zang, Bai and Yang, Jinkui", title="Exploring Cancer Incidence Trends by Age and Sex Among 14.14 Million Individuals in China From 2007 to 2021: Population-Based Study", journal="JMIR Public Health Surveill", year="2024", month="Aug", day="7", volume="10", pages="e55657", keywords="cancer", keywords="incidence", keywords="trend", keywords="sex-based", keywords="women", abstract="Background: Sex is a crucial factor in the development, progression, and treatment of cancer, making it vital to examine cancer incidence trends by sex for effective prevention strategies. Objective: This study aimed to assess the incidence of cancer in China between 2007 and 2021, with a focus on sex-based trends. Methods: A population-based cancer registry comprising 14.14 million individuals was maintained between 2007 and 2021 by the Beijing Municipal Health Big Data and Policy Research Center. The age-standardized rates (ASRs) of cancers were calculated using the Segi population. The average annual percentage of change (AAPC) was evaluated using the joinpoint regression model, while the Bayesian age-period-cohort model was used to predict cancer incidence in the next 10 years. Results: From 2007 to 2021, the study included 651,342 incident patients with cancer, of whom 51.2\% (n=333,577) were women. The incidence indicated by the ASR for all cancers combined was 200.8 per 100,000 for women and 184.4 per 100,000 for men. The increase in incidence indicated by AAPC for all malignancies combined significantly increased in women between 2007 and 2021 (AAPC=3.1\%; P<.001), whereas it remained constant in men (AAPC=0.3\%; P=.30). Although the overall incidence of all cancers indicated by AAPC increased in young men (AAPC=3.2\%; P=.01), the greatest increase was observed among young women (AAPC=6.1\%; P<.001). The incidence rate ratio for cancer in women increased among subsequent younger generations compared with patients born in the 1962-1966 cohort. The ASR in women will increase 1.6-fold over the next 10 years, with women having twice the incidence rate of men by 2031. Conclusions: The rising incidence of cancer among women in China has become a growing concern, emphasizing the need for increased efforts in cancer prevention and early screening, especially among young women. 
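The age-standardized rates in the cancer incidence study above are computed by direct standardization against the Segi world standard population: a weighted average of age-specific rates. A compact sketch follows, with invented case counts and only a few age bands, so it standardizes over those bands rather than the full age range.

```python
# Sketch of direct age standardization (rate per 100,000).
# Counts and populations are invented; only a subset of Segi age bands
# is shown, so this is illustrative, not a full ASR computation.
segi_weights = {"40-44": 6000, "45-49": 6000, "50-54": 5000, "55-59": 4000}
cases =       {"40-44": 120,   "45-49": 210,  "50-54": 340,  "55-59": 470}
population =  {"40-44": 90_000, "45-49": 85_000, "50-54": 78_000, "55-59": 70_000}

weighted = sum(
    (cases[band] / population[band]) * segi_weights[band] for band in segi_weights
)
asr = weighted / sum(segi_weights.values()) * 100_000
print(f"ASR = {asr:.1f} per 100,000 (over the included age bands)")
```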
", doi="10.2196/55657", url="https://publichealth.jmir.org/2024/1/e55657" } @Article{info:doi/10.2196/56316, author="Snowdon, Anne and Hussein, Abdulkadir and Danforth, Melissa and Wright, Alexandra and Oakes, Reid", title="Digital Maturity as a Predictor of Quality and Safety Outcomes in US Hospitals: Cross-Sectional Observational Study", journal="J Med Internet Res", year="2024", month="Aug", day="6", volume="26", pages="e56316", keywords="digital health", keywords="readiness", keywords="cross sectional", keywords="observational", keywords="regression", keywords="digital maturity", keywords="association", keywords="associations", keywords="correlation", keywords="correlations", keywords="quality and safety", keywords="hospital performance", keywords="workforce", keywords="health outcomes", keywords="safety", keywords="service", keywords="services", keywords="healthcare system", keywords="healthcare systems", keywords="hospital", keywords="hospitals", abstract="Background: This study demonstrates that digital maturity contributes to strengthened quality and safety performance outcomes in US hospitals. Advanced digital maturity is associated with more digitally enabled work environments with automated flow of data across information systems to enable clinicians and leaders to track quality and safety outcomes. This research illustrates that an advanced digitally enabled workforce is associated with strong safety leadership and culture and better patient health and safety outcomes. Objective: This study aimed to examine the relationship between digital maturity and quality and safety outcomes in US hospitals. Methods: The data sources were hospital safety letter grades as well as quality and safety scores on a continuous scale published by The Leapfrog Group. We used the digital maturity level (measured using the Electronic Medical Record Assessment Model [EMRAM]) of 1026 US hospitals. This was a cross-sectional, observational study. Logistic, linear, and Tweedie regression analyses were used to explore the relationships among The Leapfrog Group's Hospital Safety Grades, individual Leapfrog safety scores, and digital maturity levels classified as advanced or fully developed digital maturity (EMRAM levels 6 and 7) or underdeveloped maturity (EMRAM level 0). Digital maturity was a predictor while controlling for hospital characteristics including teaching status, urban or rural location, hospital size measured by number of beds, whether the hospital was a referral center, and type of hospital ownership as confounding variables. Hospitals were divided into the following 2 groups to compare safety and quality outcomes: hospitals that were digitally advanced and hospitals with underdeveloped digital maturity. Data from The Leapfrog Group's Hospital Safety Grades report published in spring 2019 were matched to the hospitals with completed EMRAM assessments in 2019. Hospital characteristics such as number of hospital beds were obtained from the CMS database. Results: The results revealed that the odds of achieving a higher Leapfrog Group Hospital Safety Grade was statistically significantly higher, by 3.25 times, for hospitals with advanced digital maturity (EMRAM maturity of 6 or 7; odds ratio 3.25, 95\% CI 2.33-4.55). Conclusions: Hospitals with advanced digital maturity had statistically significantly reduced infection rates, reduced adverse events, and improved surgical safety outcomes. 
The study findings suggest a significant difference in quality and safety outcomes among hospitals with advanced digital maturity compared with hospitals with underdeveloped digital maturity. ", doi="10.2196/56316", url="https://www.jmir.org/2024/1/e56316", url="http://www.ncbi.nlm.nih.gov/pubmed/39106100" } @Article{info:doi/10.2196/56237, author="Amadi, David and Kiwuwa-Muyingo, Sylvia and Bhattacharjee, Tathagata and Taylor, Amelia and Kiragga, Agnes and Ochola, Michael and Kanjala, Chifundo and Gregory, Arofan and Tomlin, Keith and Todd, Jim and Greenfield, Jay", title="Making Metadata Machine-Readable as the First Step to Providing Findable, Accessible, Interoperable, and Reusable Population Health Data: Framework Development and Implementation Study", journal="Online J Public Health Inform", year="2024", month="Aug", day="1", volume="16", pages="e56237", keywords="FAIR data principles", keywords="metadata", keywords="machine-readable metadata", keywords="DDI", keywords="Data Documentation Initiative", keywords="standardization", keywords="JSON-LD", keywords="JavaScript Object Notation for Linked Data", keywords="OMOP CDM", keywords="Observational Medical Outcomes Partnership Common Data Model", keywords="data science", keywords="data models", abstract="Background: Metadata describe and provide context for other data, playing a pivotal role in enabling findability, accessibility, interoperability, and reusability (FAIR) data principles. By providing comprehensive and machine-readable descriptions of digital resources, metadata empower both machines and human users to seamlessly discover, access, integrate, and reuse data or content across diverse platforms and applications. However, the limited accessibility and machine-interpretability of existing metadata for population health data hinder effective data discovery and reuse. Objective: To address these challenges, we propose a comprehensive framework using standardized formats, vocabularies, and protocols to render population health data machine-readable, significantly enhancing their FAIRness and enabling seamless discovery, access, and integration across diverse platforms and research applications. Methods: The framework implements a 3-stage approach. The first stage is Data Documentation Initiative (DDI) integration, which involves leveraging the DDI Codebook metadata and documentation of detailed information for data and associated assets, while ensuring transparency and comprehensiveness. The second stage is Observational Medical Outcomes Partnership (OMOP) Common Data Model (CDM) standardization. In this stage, the data are harmonized and standardized into the OMOP CDM, facilitating unified analysis across heterogeneous data sets. The third stage involves the integration of Schema.org and JavaScript Object Notation for Linked Data (JSON-LD), in which machine-readable metadata are generated using Schema.org entities and embedded within the data using JSON-LD, boosting discoverability and comprehension for both machines and human users. We demonstrated the implementation of these 3 stages using the Integrated Disease Surveillance and Response (IDSR) data from Malawi and Kenya. Results: The implementation of our framework significantly enhanced the FAIRness of population health data, resulting in improved discoverability through seamless integration with platforms such as Google Dataset Search. 
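The third stage of the metadata framework above embeds Schema.org Dataset descriptions as JSON-LD in a web page. A minimal sketch of generating such a block follows; all field values are placeholders, not the project's actual metadata.

```python
# Sketch: emit Schema.org "Dataset" metadata as a JSON-LD block suitable
# for embedding in an HTML page. All field values are placeholders.
import json

metadata = {
    "@context": "https://schema.org",
    "@type": "Dataset",
    "name": "Integrated Disease Surveillance and Response (IDSR) extract",
    "description": "Weekly notifiable-disease counts, harmonized to the OMOP CDM.",
    "license": "https://creativecommons.org/licenses/by/4.0/",
    "keywords": ["population health", "surveillance", "FAIR"],
    "distribution": {
        "@type": "DataDownload",
        "encodingFormat": "text/csv",
        "contentUrl": "https://example.org/idsr/extract.csv",
    },
}

html_snippet = (
    '<script type="application/ld+json">\n'
    + json.dumps(metadata, indent=2)
    + "\n</script>"
)
print(html_snippet)
```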
The adoption of standardized formats and protocols streamlined data accessibility and integration across various research environments, fostering collaboration and knowledge sharing. Additionally, the use of machine-interpretable metadata empowered researchers to efficiently reuse data for targeted analyses and insights, thereby maximizing the overall value of population health resources. The JSON-LD codes are accessible via a GitHub repository and the HTML code integrated with JSON-LD is available on the Implementation Network for Sharing Population Information from Research Entities website. Conclusions: The adoption of machine-readable metadata standards is essential for ensuring the FAIRness of population health data. By embracing these standards, organizations can enhance diverse resource visibility, accessibility, and utility, leading to a broader impact, particularly in low- and middle-income countries. Machine-readable metadata can accelerate research, improve health care decision-making, and ultimately promote better health outcomes for populations worldwide. ", doi="10.2196/56237", url="https://ojphi.jmir.org/2024/1/e56237", url="http://www.ncbi.nlm.nih.gov/pubmed/39088253" } @Article{info:doi/10.2196/49865, author="Bellmann, Louis and Wiederhold, Johannes Alexander and Tr{\"u}be, Leona and Twerenbold, Raphael and {\"U}ckert, Frank and Gottfried, Karl", title="Introducing Attribute Association Graphs to Facilitate Medical Data Exploration: Development and Evaluation Using Epidemiological Study Data", journal="JMIR Med Inform", year="2024", month="Jul", day="24", volume="12", pages="e49865", keywords="data exploration", keywords="cohort studies", keywords="data visualization", keywords="big data", keywords="statistical models", keywords="medical knowledge", keywords="data analysis", keywords="cardiovascular diseases", keywords="usability", abstract="Background: Interpretability and intuitive visualization facilitate medical knowledge generation through big data. In addition, robustness to high-dimensional and missing data is a requirement for statistical approaches in the medical domain. A method tailored to the needs of physicians must meet all the abovementioned criteria. Objective: This study aims to develop an accessible tool for visual data exploration without the need for programming knowledge, adjusting complex parameterizations, or handling missing data. We sought to apply statistical analysis in the setting of disease and control cohorts familiar to clinical researchers. We aimed to guide the user by identifying and highlighting data patterns associated with disease and reveal relations between attributes within the data set. Methods: We introduce the attribute association graph, a novel graph structure designed for visual data exploration using robust statistical metrics. The nodes capture frequencies of participant attributes in disease and control cohorts as well as deviations between groups. The edges represent conditional relations between attributes. The graph is visualized using the Neo4j (Neo4j, Inc) data platform and can be interactively explored without the need for technical knowledge. Nodes with high deviations between cohorts and edges of noticeable conditional relationship are highlighted to guide the user during the exploration. The graph is accompanied by a dashboard visualizing variable distributions. 
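The node and edge semantics just described, attribute frequencies per cohort on nodes and conditional relations on edges, can be sketched with networkx standing in for Neo4j; the cohort data below are synthetic and the two attributes are invented examples.

```python
# Sketch of an attribute association graph: nodes hold attribute frequencies
# in disease vs control cohorts; edges hold conditional probabilities.
# Synthetic data; the published version is stored and explored in Neo4j.
import numpy as np
import networkx as nx

rng = np.random.default_rng(3)
n = 5000
disease = rng.random(n) < 0.2
smoker = rng.random(n) < np.where(disease, 0.45, 0.25)     # enriched in cases
hypertension = rng.random(n) < np.where(smoker, 0.5, 0.3)  # depends on smoking

G = nx.DiGraph()
for name, attr in (("smoker", smoker), ("hypertension", hypertension)):
    G.add_node(
        name,
        freq_disease=float(attr[disease].mean()),
        freq_control=float(attr[~disease].mean()),
    )
# Edge annotated with P(hypertension | smoker) as a conditional relation.
G.add_edge("smoker", "hypertension", cond_prob=float(hypertension[smoker].mean()))

for node, data in G.nodes(data=True):
    print(node, data)
print("smoker -> hypertension:", G.edges["smoker", "hypertension"])
```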
For evaluation, we applied the graph and dashboard to the Hamburg City Health Study data set, a large cohort study conducted in the city of Hamburg, Germany. All data structures can be accessed freely by researchers, physicians, and patients. In addition, we developed a user test conducted with physicians incorporating the System Usability Scale, individual questions, and user tasks. Results: We evaluated the attribute association graph and dashboard through an exemplary data analysis of participants with a general cardiovascular disease in the Hamburg City Health Study data set. All results extracted from the graph structure and dashboard are in accordance with findings from the literature, except for unusually low cholesterol levels in participants with cardiovascular disease, which could be induced by medication. In addition, 95\% CIs of Pearson correlation coefficients were calculated for all associations identified during the data analysis, confirming the results. In addition, a user test with 10 physicians assessing the usability of the proposed methods was conducted. A System Usability Scale score of 70.5\% and average successful task completion of 81.4\% were reported. Conclusions: The proposed attribute association graph and dashboard enable intuitive visual data exploration. They are robust to high-dimensional as well as missing data and require no parameterization. The usability for clinicians was confirmed via a user test, and the validity of the statistical results was confirmed by associations known from literature and standard statistical inference. ", doi="10.2196/49865", url="https://medinform.jmir.org/2024/1/e49865" } @Article{info:doi/10.2196/55496, author="Pirmani, Ashkan and Oldenhof, Martijn and Peeters, M. Liesbet and De Brouwer, Edward and Moreau, Yves", title="Accessible Ecosystem for Clinical Research (Federated Learning for Everyone): Development and Usability Study", journal="JMIR Form Res", year="2024", month="Jul", day="17", volume="8", pages="e55496", keywords="federated learning", keywords="multistakeholder collaboration", keywords="real-world data", keywords="integrity", keywords="reliability", keywords="clinical research", keywords="implementation", keywords="inclusivity", keywords="inclusive", keywords="accessible", keywords="ecosystem", keywords="design effectiveness", abstract="Background: The integrity and reliability of clinical research outcomes rely heavily on access to vast amounts of data. However, the fragmented distribution of these data across multiple institutions, along with ethical and regulatory barriers, presents significant challenges to accessing relevant data. While federated learning offers a promising solution to leverage insights from fragmented data sets, its adoption faces hurdles due to implementation complexities, scalability issues, and inclusivity challenges. Objective: This paper introduces Federated Learning for Everyone (FL4E), an accessible framework facilitating multistakeholder collaboration in clinical research. It focuses on simplifying federated learning through an innovative ecosystem-based approach. Methods: The ``degree of federation'' is a fundamental concept of FL4E, allowing for flexible integration of federated and centralized learning models. This feature provides a customizable solution by enabling users to choose the level of data decentralization based on specific health care settings or project needs, making federated learning more adaptable and efficient. 
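The "degree of federation" concept in the FL4E framework above sits on top of standard federated aggregation. Below is a minimal sketch of weighted federated averaging (FedAvg-style) with toy parameter vectors; it illustrates the general technique, not the FL4E implementation.

```python
# Sketch of weighted federated averaging: each site trains locally and
# contributes parameters weighted by its sample count. Toy numbers only.
import numpy as np

# (site parameters, number of local samples) - hypothetical values
site_updates = [
    (np.array([0.9, -0.2, 1.1]), 1200),
    (np.array([1.1, -0.1, 0.9]),  800),
    (np.array([1.0, -0.3, 1.0]),  400),
]

total = sum(n for _, n in site_updates)
global_params = sum(params * (n / total) for params, n in site_updates)
print("aggregated parameters:", np.round(global_params, 4))
```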
Through an ecosystem-based collaborative learning strategy, FL4E provides a comprehensive platform for managing real-world data and enhancing collaboration and knowledge sharing among its stakeholders. Results: Evaluating FL4E's effectiveness using real-world health care data sets has highlighted its ecosystem-oriented and inclusive design. By applying hybrid models to 2 distinct analytical tasks---classification and survival analysis---within real-world settings, we have effectively measured the ``degree of federation'' across various contexts. These evaluations show that FL4E's hybrid models not only match the performance of fully federated models but also avoid the substantial overhead usually linked with these models. Achieving this balance greatly enhances collaborative initiatives and broadens the scope of analytical possibilities within the ecosystem. Conclusions: FL4E represents a significant step forward in collaborative clinical research by merging the benefits of centralized and federated learning. Its modular ecosystem-based design and the ``degree of federation'' feature make it an inclusive, customizable framework suitable for a wide array of clinical research scenarios, promising to revolutionize the field through improved collaboration and data use. Detailed implementation and analyses are available on the associated GitHub repository. ", doi="10.2196/55496", url="https://formative.jmir.org/2024/1/e55496" } @Article{info:doi/10.2196/55799, author="Wu, Qingxia and Li, Huali and Wang, Yan and Bai, Yan and Wu, Yaping and Yu, Xuan and Li, Xiaodong and Dong, Pei and Xue, Jon and Shen, Dinggang and Wang, Meiyun", title="Evaluating Large Language Models for Automated Reporting and Data Systems Categorization: Cross-Sectional Study", journal="JMIR Med Inform", year="2024", month="Jul", day="17", volume="12", pages="e55799", keywords="Radiology Reporting and Data Systems", keywords="LI-RADS", keywords="Lung-RADS", keywords="O-RADS", keywords="large language model", keywords="ChatGPT", keywords="chatbot", keywords="chatbots", keywords="categorization", keywords="recommendation", keywords="recommendations", keywords="accuracy", abstract="Background: Large language models show promise for improving radiology workflows, but their performance on structured radiological tasks such as Reporting and Data Systems (RADS) categorization remains unexplored. Objective: This study aims to evaluate 3 large language model chatbots---Claude-2, GPT-3.5, and GPT-4---on assigning RADS categories to radiology reports and assess the impact of different prompting strategies. Methods: This cross-sectional study compared 3 chatbots on 30 radiology reports (10 per RADS criterion) using a 3-level prompting strategy: zero-shot, few-shot, and guideline PDF-informed prompts. The cases were grounded in Liver Imaging Reporting \& Data System (LI-RADS) version 2018, Lung CT (computed tomography) Screening Reporting \& Data System (Lung-RADS) version 2022, and Ovarian-Adnexal Reporting \& Data System (O-RADS) magnetic resonance imaging, meticulously prepared by board-certified radiologists. Each report underwent 6 assessments. Two blinded reviewers assessed the chatbots' responses for patient-level RADS categorization and overall ratings. The agreement across repetitions was assessed using Fleiss $\kappa$.
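A minimal sketch of the interrun agreement computation named above, treating the 6 repeated runs as raters and computing Fleiss $\kappa$ with statsmodels (the example ratings are made up):

```python
# Sketch: rows = radiology reports, columns = 6 independent runs,
# values = assigned RADS category encoded as integers.
import numpy as np
from statsmodels.stats.inter_rater import aggregate_raters, fleiss_kappa

runs = np.array([
    [3, 3, 3, 2, 3, 3],
    [4, 4, 4, 4, 4, 4],
    [2, 2, 3, 2, 2, 2],
    [5, 5, 5, 5, 4, 5],
])
# aggregate_raters converts per-run labels into a (reports x categories)
# count table, which fleiss_kappa expects.
counts, _ = aggregate_raters(runs)
print(f"Fleiss kappa = {fleiss_kappa(counts):.2f}")
```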
Results: Claude-2 achieved the highest accuracy in overall ratings with few-shot prompts and guideline PDFs (prompt-2), attaining 57\% (17/30) average accuracy over 6 runs and 50\% (15/30) accuracy with k-pass voting. Without prompt engineering, all chatbots performed poorly. The introduction of a structured exemplar prompt (prompt-1) increased the accuracy of overall ratings for all chatbots. Providing prompt-2 further improved Claude-2's performance, an enhancement not replicated by GPT-4. The interrun agreement was substantial for Claude-2 ($\kappa$=0.66 for overall rating and $\kappa$=0.69 for RADS categorization), fair for GPT-4 ($\kappa$=0.39 for both), and fair for GPT-3.5 ($\kappa$=0.21 for overall rating and $\kappa$=0.39 for RADS categorization). All chatbots showed significantly higher accuracy with LI-RADS version 2018 than with Lung-RADS version 2022 and O-RADS (P<.05); with prompt-2, Claude-2 achieved the highest overall rating accuracy of 75\% (45/60) in LI-RADS version 2018. Conclusions: When equipped with structured prompts and guideline PDFs, Claude-2 demonstrated potential in assigning RADS categories to radiology cases according to established criteria such as LI-RADS version 2018. However, the current generation of chatbots lags in accurately categorizing cases based on more recent RADS criteria. ", doi="10.2196/55799", url="https://medinform.jmir.org/2024/1/e55799", url="http://www.ncbi.nlm.nih.gov/pubmed/39018102" } @Article{info:doi/10.2196/47693, author="Heo, Suncheol and Kang, Eun-Ae and Yu, Yong Jae and Kim, Reong Hae and Lee, Suehyun and Kim, Kwangsoo and Hwangbo, Yul and Park, Woong Rae and Shin, Hyunah and Ryu, Kyeongmin and Kim, Chungsoo and Jung, Hyojung and Chegal, Yebin and Lee, Jae-Hyun and Park, Rang Yu", title="Time Series AI Model for Acute Kidney Injury Detection Based on a Multicenter Distributed Research Network: Development and Verification Study", journal="JMIR Med Inform", year="2024", month="Jul", day="5", volume="12", pages="e47693", keywords="adverse drug reaction", keywords="real world data", keywords="multicenter study", keywords="distributed research network", keywords="common data model", keywords="time series AI", keywords="time series", keywords="artificial intelligence", keywords="machine learning", keywords="adverse reaction", keywords="adverse reactions", keywords="detect", keywords="detection", keywords="toxic", keywords="toxicity", keywords="renal", keywords="kidney", keywords="nephrology", keywords="pharmaceutical", keywords="pharmacology", keywords="pharmacy", keywords="pharmaceutics", abstract="Background: Acute kidney injury (AKI) is a marker of clinical deterioration and renal toxicity. While there are many studies offering prediction models for the early detection of AKI, those predicting AKI occurrence using distributed research network (DRN)--based time series data are rare. Objective: In this study, we aimed to detect the early occurrence of AKI by applying an interpretable long short-term memory (LSTM)--based model to hospital electronic health record (EHR)--based time series data in patients who took nephrotoxic drugs using a DRN. Methods: We conducted a multi-institutional retrospective cohort study of data from 6 hospitals using a DRN. For each institution, a patient-based data set was constructed using 5 drugs for AKI, and an interpretable multivariable LSTM (IMV-LSTM) model was used for training. This study used propensity score matching to mitigate differences in demographics and clinical characteristics.
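A minimal sketch of 1:1 nearest-neighbor propensity score matching, the general technique named above (not the study's actual DRN pipeline; the covariate names are hypothetical and the data are simulated):

```python
# Sketch: estimate propensity scores with logistic regression, then match
# each treated subject to the control with the closest score.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "age": rng.normal(65, 10, 200),
    "creatinine": rng.normal(1.1, 0.3, 200),
    "treated": rng.integers(0, 2, 200),   # nephrotoxic drug exposure
})
X = df[["age", "creatinine"]]

# 1. Propensity score: P(treated | covariates).
df["ps"] = LogisticRegression().fit(X, df["treated"]).predict_proba(X)[:, 1]
treated, control = df[df.treated == 1], df[df.treated == 0]

# 2. Nearest-neighbor matching on the score (with replacement, for brevity).
nn = NearestNeighbors(n_neighbors=1).fit(control[["ps"]])
_, idx = nn.kneighbors(treated[["ps"]])
matched_controls = control.iloc[idx.ravel()]
print(len(treated), "treated matched to", len(matched_controls), "controls")
```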
Additionally, the temporal attention values of the AKI prediction model's contribution variables were demonstrated for each institution and drug, with differences in highly important feature distributions between the case and control data confirmed using 1-way ANOVA. Results: This study analyzed 8643 and 31,012 patients with and without AKI, respectively, across 6 hospitals. When analyzing the distribution of AKI onset, vancomycin showed the earliest onset (median 12, IQR 5-25 days), and acyclovir showed the latest onset of the drugs studied (median 23, IQR 10-41 days). Our temporal deep learning model for AKI prediction performed well for most drugs. Acyclovir had the highest average area under the receiver operating characteristic curve score per drug (0.94), followed by acetaminophen (0.93), vancomycin (0.92), naproxen (0.90), and celecoxib (0.89). Based on the temporal attention values of the variables in the AKI prediction model, we verified that lymphocytes and calcium had the highest attention, whereas lymphocytes, albumin, and hemoglobin tended to decrease over time, and urine pH and prothrombin time tended to increase. Conclusions: Early surveillance of AKI outbreaks can be achieved by applying an IMV-LSTM based on time series data through an EHR-based DRN. This approach can help identify risk factors and enable early detection of adverse drug reactions when prescribing drugs that cause renal toxicity before AKI occurs. ", doi="10.2196/47693", url="https://medinform.jmir.org/2024/1/e47693" } @Article{info:doi/10.2196/59680, author="Herman Bernardim Andrade, Gabriel and Yada, Shuntaro and Aramaki, Eiji", title="Is Boundary Annotation Necessary? Evaluating Boundary-Free Approaches to Improve Clinical Named Entity Annotation Efficiency: Case Study", journal="JMIR Med Inform", year="2024", month="Jul", day="2", volume="12", pages="e59680", keywords="natural language processing", keywords="named entity recognition", keywords="information extraction", keywords="text annotation", keywords="entity boundaries", keywords="lenient annotation", keywords="case reports", keywords="annotation", keywords="case study", keywords="medical case report", keywords="efficiency", keywords="model", keywords="model performance", keywords="dataset", keywords="Japan", keywords="Japanese", keywords="entity", keywords="clinical domain", keywords="clinical", abstract="Background: Named entity recognition (NER) is a fundamental task in natural language processing. However, it is typically preceded by named entity annotation, which poses several challenges, especially in the clinical domain. For instance, determining entity boundaries is one of the most common sources of disagreements between annotators due to questions such as whether modifiers or peripheral words should be annotated. If unresolved, these can induce inconsistency in the produced corpora, yet, on the other hand, strict guidelines or adjudication sessions can further prolong an already slow and convoluted process. Objective: The aim of this study is to address these challenges by evaluating 2 novel annotation methodologies, lenient span and point annotation, aiming to mitigate the difficulty of precisely determining entity boundaries. Methods: We evaluate their effects through an annotation case study on a Japanese medical case report data set. We compare annotation time, annotator agreement, and the quality of the produced labeling and assess the impact on the performance of an NER system trained on the annotated corpus.
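A minimal sketch of lenient, overlap-based span matching, the kind of boundary-tolerant comparison that motivates the annotation study above (an illustration of the general idea, not the study's exact metric):

```python
# Sketch: a predicted entity counts as correct if it overlaps a gold entity
# at all, rather than requiring exact boundary agreement.
def overlaps(a, b):
    """True if two (start, end) character spans share at least one position."""
    return a[0] < b[1] and b[0] < a[1]

def lenient_precision_recall(pred_spans, gold_spans):
    tp_pred = sum(any(overlaps(p, g) for g in gold_spans) for p in pred_spans)
    tp_gold = sum(any(overlaps(g, p) for p in pred_spans) for g in gold_spans)
    precision = tp_pred / len(pred_spans) if pred_spans else 0.0
    recall = tp_gold / len(gold_spans) if gold_spans else 0.0
    return precision, recall

# Predicted span (10, 18) only partially covers gold (12, 20) but still counts.
print(lenient_precision_recall([(10, 18), (30, 35)], [(12, 20), (40, 45)]))
```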
Results: We observed significant improvements in labeling efficiency, with up to a 25\% reduction in overall annotation time and a 10\% improvement in annotator agreement compared to the traditional boundary-strict approach. However, even the best-performing NER model showed some drop in performance compared to the traditional annotation methodology. Conclusions: Our findings demonstrate a trade-off between annotation speed and model performance. Although disregarding boundary information affects model performance to some extent, this is counterbalanced by significant reductions in the annotator's workload and notable improvements in the speed of the annotation process. These benefits may prove valuable in various applications, offering an attractive compromise for developers and researchers. ", doi="10.2196/59680", url="https://medinform.jmir.org/2024/1/e59680" } @Article{info:doi/10.2196/55118, author="Akiya, Ippei and Ishihara, Takuma and Yamamoto, Keiichi", title="Comparison of Synthetic Data Generation Techniques for Control Group Survival Data in Oncology Clinical Trials: Simulation Study", journal="JMIR Med Inform", year="2024", month="Jun", day="18", volume="12", pages="e55118", keywords="oncology clinical trial", keywords="survival analysis", keywords="synthetic patient data", keywords="machine learning", keywords="SPD", keywords="simulation", abstract="Background: Synthetic patient data (SPD) generation for survival analysis in oncology trials holds significant potential for accelerating clinical development. Various machine learning methods, including classification and regression trees (CART), random forest (RF), Bayesian network (BN), and conditional tabular generative adversarial network (CTGAN), have been used for this purpose, but their performance in reflecting actual patient survival data remains under investigation. Objective: The aim of this study was to determine the most suitable SPD generation method for oncology trials, specifically focusing on both progression-free survival (PFS) and overall survival (OS), which are the primary evaluation end points in oncology trials. To achieve this goal, we conducted a comparative simulation of 4 generation methods, including CART, RF, BN, and the CTGAN, and the performance of each method was evaluated. Methods: Using multiple clinical trial data sets, 1000 data sets were generated by using each method for each clinical trial data set and evaluated as follows: (1) median survival time (MST) of PFS and OS; (2) hazard ratio distance (HRD), which indicates the similarity between the actual survival function and a synthetic survival function; and (3) visual analysis of Kaplan-Meier (KM) plots. Each method's ability to mimic the statistical properties of real patient data was evaluated from these multiple angles. Results: In most simulation cases, CART demonstrated high percentages of MSTs for synthetic data falling within the 95\% CI range of the MST of the actual data. These percentages ranged from 88.8\% to 98.0\% for PFS and from 60.8\% to 96.1\% for OS. In the evaluation of HRD, the values for CART were concentrated at approximately 0.9. Conversely, for the other methods, no consistent trend was observed for either PFS or OS. CART demonstrated better similarity than RF because CART tends to overfit, closely reproducing the source data, whereas RF (an ensemble learning approach) prevents such overfitting. In SPD generation, the focus should be on statistical properties close to those of the actual data, not on a well-generalized prediction model.
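A minimal sketch of the MST-style check described above, testing whether the median survival time of a synthetic sample falls within the 95\% CI of the real data's median, using lifelines on simulated survival times (not the trial data):

```python
# Sketch: compare Kaplan-Meier median survival of synthetic vs real data.
import numpy as np
from lifelines import KaplanMeierFitter
from lifelines.utils import median_survival_times

rng = np.random.default_rng(1)
real_t = rng.exponential(12.0, 300)              # real PFS times (months)
real_e = (rng.random(300) < 0.7).astype(int)     # 1 = event, 0 = censored
synth_t = rng.exponential(11.5, 300)             # synthetic PFS times
synth_e = (rng.random(300) < 0.7).astype(int)

kmf = KaplanMeierFitter().fit(real_t, real_e)
ci = median_survival_times(kmf.confidence_interval_)   # 95% CI of real MST
lo, hi = ci.iloc[0, 0], ci.iloc[0, 1]

synth_mst = KaplanMeierFitter().fit(synth_t, synth_e).median_survival_time_
print(f"synthetic MST {synth_mst:.1f} within real 95% CI [{lo:.1f}, {hi:.1f}]:",
      lo <= synth_mst <= hi)
```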
Neither the BN nor the CTGAN method could accurately reflect the statistical properties of the actual data, as these methods are not well suited to small data sets. Conclusions: As a method for generating SPD for survival data from small data sets, such as clinical trial data, CART was demonstrated to be the most effective method compared with RF, BN, and CTGAN. Additionally, it is possible to improve CART-based generation methods by incorporating feature engineering and other methods in future work. ", doi="10.2196/55118", url="https://medinform.jmir.org/2024/1/e55118" } @Article{info:doi/10.2196/52290, author="Doll, Joy and Anzalone, Jerrod A. and Clarke, Martina and Cooper, Kathryn and Polich, Ann and Siedlik, Jacob", title="A Call for a Health Data--Informed Workforce Among Clinicians", journal="JMIR Med Educ", year="2024", month="Jun", day="17", volume="10", pages="e52290", keywords="health data--informed workforce", keywords="health data", keywords="health informaticist", keywords="data literacy", keywords="workforce development", doi="10.2196/52290", url="https://mededu.jmir.org/2024/1/e52290" } @Article{info:doi/10.2196/53219, author="Soetikno, G. Alan and Lundberg, L. Alexander and Ozer, A. Egon and Wu, A. Scott and Welch, B. Sarah and Mason, Maryann and Liu, Yingxuan and Havey, J. Robert and Murphy, L. Robert and Hawkins, Claudia and Moss, B. Charles and Post, Ann Lori", title="Updated Surveillance Metrics and History of the COVID-19 Pandemic (2020-2023) in the Middle East and North Africa: Longitudinal Trend Analysis", journal="JMIR Public Health Surveill", year="2024", month="Jun", day="12", volume="10", pages="e53219", keywords="SARS-CoV-2", keywords="COVID-19", keywords="Middle East", keywords="North Africa", keywords="Bahrain", keywords="Iran", keywords="Iraq", keywords="Israel", keywords="Jordan", keywords="Kuwait", keywords="Lebanon", keywords="Oman", keywords="Qatar", keywords="Saudi Arabia", keywords="Syria", keywords="the United Arab Emirates", keywords="Yemen", keywords="Algeria", keywords="Djibouti", keywords="Egypt", keywords="Libya", keywords="Morocco", keywords="Tunisia", keywords="pandemic history", keywords="COVID-19 transmission", keywords="speed", keywords="acceleration", keywords="deceleration", keywords="jerk", keywords="dynamic panel", keywords="generalized method of moments", keywords="Arellano-Bond", keywords="7-day lag", abstract="Background: This study updates the COVID-19 pandemic surveillance in the Middle East and North Africa (MENA) that we first conducted in 2020, adding 2 more years of data for the region. Objective: The objective of this study is to determine whether the MENA region meets the criteria for moving from a pandemic to endemic. In doing so, this study considers pandemic trends, dynamic and genomic surveillance methods, and region-specific historical context for the pandemic. These considerations continue through the World Health Organization (WHO) declaration of the end of the public health emergency for the COVID-19 pandemic on May 5, 2023. Methods: In addition to updates to traditional surveillance data and dynamic panel estimates from the original study by Post et al, this study used data on sequenced SARS-CoV-2 variants from the Global Initiative on Sharing All Influenza Data (GISAID) to identify the appearance and duration of variants of concern. We used Nextclade nomenclature to collect clade designations from sequences and Pangolin nomenclature for lineage designations of SARS-CoV-2.
Finally, we conducted a 1-sided t test to determine whether the regional weekly speed of COVID-19 spread was greater than an outbreak threshold of 10. We ran the test iteratively over rolling 6-month windows of data from September 4, 2020, to May 12, 2023. Results: The speed of COVID-19 spread for the region had remained below the outbreak threshold for 7 continuous months by the time of the WHO declaration. Acceleration and jerk were also low and stable. Although the 1- and 7-day persistence coefficients remained statistically significant and positive, the weekly shift parameters suggested the coefficients had most recently turned negative, meaning the clustering effect of new COVID-19 cases became even smaller in the 2 weeks around the WHO declaration. From December 2021 onward, Omicron was the predominant variant of concern in sequenced viral samples. The rolling t test of whether the speed of spread exceeded 10 became entirely nonsignificant from October 2022 onward. Conclusions: The COVID-19 pandemic had far-reaching effects on MENA, impacting health care systems, economies, and social well-being. Although COVID-19 continues to circulate in the MENA region, the rate of transmission remained well below the threshold of an outbreak for over 1 year ahead of the WHO declaration. COVID-19 is endemic in the region and no longer reaches the threshold of the pandemic definition. Both standard and enhanced surveillance metrics confirm that the pandemic had transitioned to endemic by the time of the WHO declaration. ", doi="10.2196/53219", url="https://publichealth.jmir.org/2024/1/e53219", url="http://www.ncbi.nlm.nih.gov/pubmed/38568184" } @Article{info:doi/10.2196/56686, author="Shau, Wen-Yi and Santoso, Handoko and Jip, Vincent and Setia, Sajita", title="Integrated Real-World Data Warehouses Across 7 Evolving Asian Health Care Systems: Scoping Review", journal="J Med Internet Res", year="2024", month="Jun", day="11", volume="26", pages="e56686", keywords="Asia", keywords="health care databases", keywords="cross-country comparison", keywords="electronic health records", keywords="electronic medical records", keywords="data warehousing", keywords="information storage and retrieval", keywords="real-world data", keywords="real-world evidence", keywords="registries", keywords="scoping review", abstract="Background: Asia consists of diverse nations with extremely variable health care systems. Integrated real-world data (RWD) research warehouses provide vast interconnected data sets that uphold statistical rigor. Yet, their intricate details remain underexplored, restricting their broader applications. Objective: Building on our previous research that analyzed integrated RWD warehouses in India, Thailand, and Taiwan, this study extends the research to 7 distinct health care systems: Hong Kong, Indonesia, Malaysia, Pakistan, the Philippines, Singapore, and Vietnam. We aimed to map the evolving landscape of RWD, preferences for methodologies, and database use and archetype the health systems based on existing intrinsic capability for RWD generation. Methods: A systematic scoping review methodology was used, centering on contemporary English literature on PubMed (search date: May 9, 2023). Rigorous screening as defined by eligibility criteria identified RWD studies from multiple health care facilities in at least 1 of the 7 target Asian nations. Point estimates and their associated errors were determined for the data collected from eligible studies.
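A minimal sketch of the rolling 1-sided t test described in the MENA surveillance entry above, asking whether the mean weekly speed of spread exceeds the outbreak threshold of 10 (the weekly series here is simulated):

```python
# Sketch: iterate a one-sided one-sample t test over rolling ~6-month windows.
import numpy as np
from scipy.stats import ttest_1samp

rng = np.random.default_rng(2)
weekly_speed = rng.normal(8.0, 3.0, 140)   # ~140 weeks of regional data
window = 26                                # roughly 6 months of weekly values

for end in range(window, len(weekly_speed) + 1, 13):
    sample = weekly_speed[end - window:end]
    # H0: mean speed <= 10; H1: mean speed > 10 (outbreak-level spread).
    res = ttest_1samp(sample, popmean=10, alternative="greater")
    print(f"weeks {end - window:3d}-{end:3d}: p = {res.pvalue:.3f}")
```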
Results: Of the 1483 real-world evidence citations identified on May 9, 2023, a total of 369 (24.9\%) fulfilled the requirements for data extraction and subsequent analysis. Singapore, Hong Kong, and Malaysia each contributed $\geq$100 publications; each was marked by a higher proportion of single-country studies, at 51\% (80/157), 66.2\% (86/130), and 50\% (50/100), respectively, and these countries were classified as solo scholars. Indonesia, Pakistan, Vietnam, and the Philippines had fewer publications and a higher proportion of cross-country collaboration studies (CCCSs) at 79\% (26/33), 58\% (18/31), 74\% (20/27), and 86\% (19/22), respectively, and were classified as global collaborators. Collaboration with countries outside the 7 target nations appeared in 84.2\% to 97.7\% of the CCCSs of each nation. Among target nations, Singapore and Malaysia emerged as preferred research partners for other nations. From 2018 to 2023, most nations showed an increasing trend in study numbers, with Vietnam (24.5\%) and Pakistan (21.2\%) leading the growth; the only exception was the Philippines, which declined by 14.5\%. Clinical registry databases were predominant across all CCCSs from every target nation. For single-country studies, Indonesia, Malaysia, and the Philippines favored clinical registries; Singapore had a balanced use of clinical registries and electronic medical or health records, whereas Hong Kong, Pakistan, and Vietnam leaned toward electronic medical or health records. Overall, 89.9\% (310/345) of the studies took >2 years from completion to publication. Conclusions: The observed variations in contemporary RWD publications across the 7 nations in Asia exemplify distinct research landscapes across nations that are partially explained by their diverse economic, clinical, and research settings. Nevertheless, recognizing these variations is pivotal for fostering tailored, synergistic strategies that amplify RWD's potential in guiding future health care research and policy decisions. International Registered Report Identifier (IRRID): RR2-10.2196/43741 ", doi="10.2196/56686", url="https://www.jmir.org/2024/1/e56686", url="http://www.ncbi.nlm.nih.gov/pubmed/38749399" } @Article{info:doi/10.2196/54355, author="Huang, Jiaoling and Qian, Ying and Yan, Yuge and Liang, Hong and Zhao, Laijun", title="Addressing Hospital Overwhelm During the COVID-19 Pandemic by Using a Primary Health Care--Based Integrated Health System: Modeling Study", journal="JMIR Med Inform", year="2024", month="Jun", day="3", volume="12", pages="e54355", keywords="hospital overwhelm", keywords="primary health care", keywords="modeling study", keywords="policy mix", keywords="pandemic", keywords="model", keywords="simulation", keywords="simulations", keywords="integrated", keywords="health system", keywords="hospital", keywords="hospitals", keywords="management", keywords="service", keywords="services", keywords="health systems", keywords="develop", keywords="development", keywords="bed", keywords="beds", keywords="overwhelm", keywords="death", keywords="deaths", keywords="mortality", keywords="primary care", abstract="Background: After strict COVID-19--related restrictions were lifted, health systems globally were overwhelmed. Much has been discussed about how health systems could better prepare for future pandemics; however, primary health care (PHC) has been largely ignored.
Objective: We aimed to investigate what combined policies PHC could apply to strengthen the health care system via a bottom-up approach, so as to better respond to a public health emergency. Methods: We developed a system dynamics model to replicate Shanghai's response when COVID-19--related restrictions were lifted. We then simulated an alternative PHC-based integrated health system and tested the following three interventions: first contact in PHC with telemedicine services, recommendation to secondary care, and return to PHC for recovery. Results: The simulation results showed that each selected intervention could alleviate hospital overwhelm. Increasing the rate of first contact in PHC with telemedicine increased hospital bed availability by 6\% to 12\% and reduced the cumulative number of deaths by 35\%. More precise recommendations had a limited impact on hospital overwhelm (<1\%), but the simulation results showed that underrecommendation (rate: 80\%) would result in a 19\% increase in cumulative deaths. Increasing the rate of return to PHC from 5\% to 20\% improved hospital bed availability by 6\% to 16\% and reduced the cumulative number of deaths by 46\%. Moreover, combining all 3 interventions had a multiplier effect; bed availability increased by 683\%, and the cumulative number of deaths dropped by 75\%. Conclusions: Rather than focusing on the allocation of medical resources in secondary care, we determined that an optimal PHC-based integrated strategy would be to have a 60\% rate of first contact in PHC, a 110\% recommendation rate, and a 20\% rate of return to PHC. This could increase health system resilience during public health emergencies. ", doi="10.2196/54355", url="https://medinform.jmir.org/2024/1/e54355" } @Article{info:doi/10.2196/47682, author="Janssen, Anna and Donnelly, Candice and Shaw, Tim", title="A Taxonomy for Health Information Systems", journal="J Med Internet Res", year="2024", month="May", day="31", volume="26", pages="e47682", keywords="eHealth", keywords="digital health", keywords="electronic health data", keywords="data revolution", keywords="actionable data", keywords="mobile phone", doi="10.2196/47682", url="https://www.jmir.org/2024/1/e47682", url="http://www.ncbi.nlm.nih.gov/pubmed/38820575" } @Article{info:doi/10.2196/50204, author="Vall{\'e}e, Alexandre", title="Envisioning the Future of Personalized Medicine: Role and Realities of Digital Twins", journal="J Med Internet Res", year="2024", month="May", day="13", volume="26", pages="e50204", keywords="digital health", keywords="digital twin", keywords="personalized medicine", keywords="prevention", keywords="prediction", keywords="health care system", doi="10.2196/50204", url="https://www.jmir.org/2024/1/e50204", url="http://www.ncbi.nlm.nih.gov/pubmed/38739913" } @Article{info:doi/10.2196/49129, author="Kim, Hoon Seung and Kim, Hyunkyu and Jeong, Hoon Sung and Park, Eun-Cheol", title="Association of the Type of Public Pension With Mental Health Among South Korean Older Adults: Longitudinal Observational Study", journal="JMIR Public Health Surveill", year="2024", month="May", day="2", volume="10", pages="e49129", keywords="depression", keywords="retirement", keywords="contributory public pension", keywords="low-income household", keywords="public health", keywords="mental health", keywords="data", keywords="big data", keywords="longitudinal data", keywords="low income", abstract="Background: As income and health are closely related, retirement is considered undesirable for health. 
Many studies have shown the association between pension and health, but no research has considered the association between contribution-based public pensions or their types and health. Objective: This study investigates the association between the type of contributory public pension and depressive symptoms among older adults. Methods: We analyzed the data of 4541 older adults who participated in the South Korea Welfare Panel Study (2014-2020). Depressive symptoms were measured using the 11-item Center for Epidemiologic Studies Depression scale. Public pensions in South Korea are classified into specific corporate pensions and national pensions. For subgroup analyses, pensioners were categorized according to the amount of pension received and the proportion of public pension relative to gross income. Analyses using generalized estimating equations were conducted for longitudinal data. Results: Individuals receiving a public pension, regardless of the pension type, demonstrated significantly decreased depressive symptoms (national pension: $\beta$=--.734; P<.001; specific corporate pension: $\beta$=--.775; P=.02). For both pension types, the higher the amount of benefits, the lower were the depression scores. However, this association was absent for specific corporate pensioners who received smaller benefit amounts. In low-income households, the decrease in the depressive symptoms based on the amount of public pension benefits was greater (fourth quartile of national pension: $\beta$=--1.472; P<.001; second and third quartiles of specific corporate pension: $\beta$=--3.646; P<.001). Conclusions: Our study shows that contributory public pension is significantly associated with lower depressive symptoms, and this association is prominent in low-income households. Thus, contributory public pensions may be good income sources for improving the mental health of older adults after retirement. ", doi="10.2196/49129", url="https://publichealth.jmir.org/2024/1/e49129", url="http://www.ncbi.nlm.nih.gov/pubmed/38696246" } @Article{info:doi/10.2196/51354, author="Li, Mingxia and Han, Shuzhe and Liang, Fang and Hu, Chenghuan and Zhang, Buyao and Hou, Qinlan and Zhao, Shuangping", title="Machine Learning for Predicting Risk and Prognosis of Acute Kidney Disease in Critically Ill Elderly Patients During Hospitalization: Internet-Based and Interpretable Model Study", journal="J Med Internet Res", year="2024", month="May", day="1", volume="26", pages="e51354", keywords="acute kidney disease", keywords="AKD", keywords="machine learning", keywords="critically ill patients", keywords="elderly patients", keywords="Shapley additive explanation", keywords="SHAP", abstract="Background: Acute kidney disease (AKD) affects more than half of critically ill elderly patients with acute kidney injury (AKI), which leads to worse short-term outcomes. Objective: We aimed to establish 2 machine learning models to predict the risk and prognosis of AKD in the elderly and to deploy the models as online apps. Methods: Data on elderly patients with AKI (n=3542) and AKD (n=2661) from the Medical Information Mart for Intensive Care IV (MIMIC-IV) database were used to develop 2 models for predicting the AKD risk and in-hospital mortality, respectively. Data collected from Xiangya Hospital of Central South University were used for external validation. A bootstrap method was used for internal validation to obtain relatively stable results.
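A minimal sketch of a generalized estimating equation (GEE) analysis of the kind used in the pension study above, with an exchangeable working correlation for repeated waves per person (the variable names are hypothetical; the data are simulated):

```python
# Sketch: GEE for longitudinal panel data with statsmodels.
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

rng = np.random.default_rng(3)
n_people, n_waves = 200, 5
df = pd.DataFrame({
    "person_id": np.repeat(np.arange(n_people), n_waves),
    "pension": np.repeat(rng.integers(0, 2, n_people), n_waves),
    "age": np.repeat(rng.integers(65, 85, n_people), n_waves),
})
# Simulated depression score: pension receipt lowers it slightly.
df["cesd"] = 10 - 0.7 * df["pension"] + 0.05 * df["age"] + rng.normal(0, 2, len(df))

# Exchangeable working correlation accounts for repeated waves per person.
model = smf.gee("cesd ~ pension + age", groups="person_id", data=df,
                cov_struct=sm.cov_struct.Exchangeable(),
                family=sm.families.Gaussian())
print(model.fit().summary())
```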
We extracted the indicators within 24 hours of the first diagnosis of AKI and the fluctuation range of some indicators, namely delta (day 3 after AKI minus day 1), as features. Six machine learning algorithms were used for modeling; the area under the receiver operating characteristic curve (AUROC), decision curve analysis, and calibration curves were used for evaluation; Shapley additive explanation (SHAP) analysis was used for visual interpretation; and the Heroku platform was used to deploy the best-performing models as web-based apps. Results: For the model predicting the risk of AKD in elderly patients with AKI during hospitalization, the Light Gradient Boosting Machine (LightGBM) showed the best overall performance in the training (AUROC=0.844, 95\% CI 0.831-0.857), internal validation (AUROC=0.853, 95\% CI 0.841-0.865), and external (AUROC=0.755, 95\% CI 0.699-0.811) cohorts. In addition, LightGBM performed well for the AKD prognostic prediction in the training (AUROC=0.861, 95\% CI 0.843-0.878), internal validation (AUROC=0.868, 95\% CI 0.851-0.885), and external (AUROC=0.746, 95\% CI 0.673-0.820) cohorts. The models, deployed as online prediction apps, allowed users to obtain predictions and submit new data as feedback for model iteration. In the SHAP-based importance ranking and correlation visualization of the models' top 10 influencing factors, partial dependence plots revealed the optimal cutoffs of some modifiable indicators. The top 5 factors predicting the risk of AKD were creatinine on day 3, sepsis, delta blood urea nitrogen (BUN), diastolic blood pressure (DBP), and heart rate, while the top 5 factors determining in-hospital mortality were age, BUN on day 1, vasopressor use, BUN on day 3, and partial pressure of carbon dioxide (PaCO2). Conclusions: We developed and validated 2 online apps for predicting the risk of AKD and its prognostic mortality in elderly patients, respectively. The top 10 factors that influenced the AKD risk and mortality during hospitalization were identified and explained visually, which might provide useful applications for intelligent management and suggestions for future prospective research. ", doi="10.2196/51354", url="https://www.jmir.org/2024/1/e51354", url="http://www.ncbi.nlm.nih.gov/pubmed/38691403" } @Article{info:doi/10.2196/49445, author="Pilgram, Lisa and Meurers, Thierry and Malin, Bradley and Schaeffner, Elke and Eckardt, Kai-Uwe and Prasser, Fabian and ", title="The Costs of Anonymization: Case Study Using Clinical Data", journal="J Med Internet Res", year="2024", month="Apr", day="24", volume="26", pages="e49445", keywords="data sharing", keywords="anonymization", keywords="deidentification", keywords="privacy-utility trade-off", keywords="privacy-enhancing technologies", keywords="medical informatics", keywords="privacy", keywords="anonymized", keywords="security", keywords="identification", keywords="confidentiality", keywords="data science", abstract="Background: Sharing data from clinical studies can accelerate scientific progress, improve transparency, and increase the potential for innovation and collaboration. However, privacy concerns remain a barrier to data sharing. Certain concerns, such as reidentification risk, can be addressed through the application of anonymization algorithms, whereby data are altered so that they are no longer reasonably related to a person. Yet, such alterations have the potential to influence the data set's statistical properties, such that the privacy-utility trade-off must be considered.
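A minimal sketch of the LightGBM-plus-SHAP pattern used in the AKD study above: fit a gradient-boosted classifier, then rank features by mean absolute SHAP value (the feature names are hypothetical; the data are simulated):

```python
# Sketch: train LightGBM, then derive a SHAP-based importance ranking.
import numpy as np
import pandas as pd
import lightgbm as lgb
import shap

rng = np.random.default_rng(4)
X = pd.DataFrame({
    "creatinine_d3": rng.normal(1.5, 0.5, 500),
    "bun_delta": rng.normal(5, 8, 500),
    "dbp": rng.normal(70, 12, 500),
})
y = (X["creatinine_d3"] + 0.05 * X["bun_delta"]
     + rng.normal(0, 0.5, 500) > 1.8).astype(int)

model = lgb.LGBMClassifier(n_estimators=200).fit(X, y)

explainer = shap.TreeExplainer(model)
sv = explainer.shap_values(X)
sv = sv[1] if isinstance(sv, list) else np.asarray(sv)
if sv.ndim == 3:                       # some shap versions return (n, f, 2)
    sv = sv[:, :, 1]
ranking = pd.Series(np.abs(sv).mean(axis=0), index=X.columns)
print(ranking.sort_values(ascending=False))   # top influencing factors
```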
This has been studied in theory, but evidence based on real-world individual-level clinical data is rare, and anonymization has not broadly been adopted in clinical practice. Objective: The goal of this study is to contribute to a better understanding of anonymization in the real world by comprehensively evaluating the privacy-utility trade-off of differently anonymized data using data and scientific results from the German Chronic Kidney Disease (GCKD) study. Methods: The GCKD data set extracted for this study consists of 5217 records and 70 variables. A 2-step procedure was followed to determine which variables constituted reidentification risks. To capture a large portion of the risk-utility space, we decided on risk thresholds ranging from 0.02 to 1. The data were then transformed via generalization and suppression, and the anonymization process was varied using a generic and a use case--specific configuration. To assess the utility of the anonymized GCKD data, general-purpose metrics (ie, data granularity and entropy), as well as use case--specific metrics (ie, reproducibility), were applied. Reproducibility was assessed by measuring the overlap of the 95\% CI lengths between anonymized and original results. Results: Reproducibility measured by 95\% CI overlap was higher than utility obtained from general-purpose metrics. For example, granularity varied between 68.2\% and 87.6\%, and entropy varied between 25.5\% and 46.2\%, whereas the average 95\% CI overlap was above 90\% for all risk thresholds applied. A nonoverlapping 95\% CI was detected in 6 estimates across all analyses, but the overwhelming majority of estimates exhibited an overlap over 50\%. The use case--specific configuration outperformed the generic one in terms of actual utility (ie, reproducibility) at the same level of privacy. Conclusions: Our results illustrate the challenges that anonymization faces when aiming to support multiple likely and possibly competing uses, while use case--specific anonymization can provide greater utility. This aspect should be taken into account when evaluating the associated costs of anonymized data and attempting to maintain sufficiently high levels of privacy for anonymized data. Trial Registration: German Clinical Trials Register DRKS00003971; https://drks.de/search/en/trial/DRKS00003971 International Registered Report Identifier (IRRID): RR2-10.1093/ndt/gfr456 ", doi="10.2196/49445", url="https://www.jmir.org/2024/1/e49445", url="http://www.ncbi.nlm.nih.gov/pubmed/38657232" } @Article{info:doi/10.2196/49646, author="Abu Attieh, Hammam and Neves, Telmo Diogo and Guedes, Mariana and Mirandola, Massimo and Dellacasa, Chiara and Rossi, Elisa and Prasser, Fabian", title="A Scalable Pseudonymization Tool for Rapid Deployment in Large Biomedical Research Networks: Development and Evaluation Study", journal="JMIR Med Inform", year="2024", month="Apr", day="23", volume="12", pages="e49646", keywords="biomedical research", keywords="research network", keywords="data sharing", keywords="data protection", keywords="privacy", keywords="pseudonymization", abstract="Background: The SARS-CoV-2 pandemic has demonstrated once again that rapid collaborative research is essential for the future of biomedicine. Large research networks are needed to collect, share, and reuse data and biosamples to generate collaborative evidence. 
However, setting up such networks is often complex and time-consuming, as common tools and policies are needed to ensure interoperability and the required flows of data and samples, especially for handling personal data and the associated data protection issues. In biomedical research, pseudonymization detaches directly identifying details from biomedical data and biosamples and connects them using secure identifiers, the so-called pseudonyms. This protects privacy by design but allows the necessary linkage and reidentification. Objective: Although pseudonymization is used in almost every biomedical study, there are currently no pseudonymization tools that can be rapidly deployed across many institutions. Moreover, using centralized services is often not possible, for example, when data are reused and consent for this type of data processing is lacking. We present the ORCHESTRA Pseudonymization Tool (OPT), developed under the umbrella of the ORCHESTRA consortium, which faced exactly these challenges when it came to rapidly establishing a large-scale research network in the context of the rapid pandemic response in Europe. Methods: To overcome challenges caused by the heterogeneity of IT infrastructures across institutions, the OPT was developed based on programmable runtime environments available at practically every institution: office suites. The software is highly configurable and provides many features, from subject and biosample registration to record linkage and the printing of machine-readable codes for labeling biosample tubes. Special care has been taken to ensure that the algorithms implemented are efficient so that the OPT can be used to pseudonymize large data sets, which we demonstrate through a comprehensive evaluation. Results: The OPT is available for Microsoft Office and LibreOffice, so it can be deployed on Windows, Linux, and MacOS. It provides multiuser support and is configurable to meet the needs of different types of research projects. Within the ORCHESTRA research network, the OPT has been successfully deployed at 13 institutions in 11 countries in Europe and beyond. As of June 2023, the software manages data about more than 30,000 subjects and 15,000 biosamples. Over 10,000 labels have been printed. The results of our experimental evaluation show that the OPT offers practical response times for all major functionalities, pseudonymizing 100,000 subjects in 10 seconds using Microsoft Excel and in 54 seconds using LibreOffice. Conclusions: Innovative solutions are needed to make the process of establishing large research networks more efficient. The OPT, which leverages the runtime environment of common office suites, can be used to rapidly deploy pseudonymization and biosample management capabilities across research networks. The tool is highly configurable and available as open-source software. 
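A generic keyed-hash pseudonymization sketch using only the Python standard library; it illustrates the basic idea of deterministic, secret-keyed pseudonyms for record linkage, but it is not the OPT's actual office-suite algorithm, and the key shown is a placeholder:

```python
# Sketch: map a direct identifier to a stable, non-reversible pseudonym.
import base64
import hashlib
import hmac

SECRET_KEY = b"replace-with-a-securely-stored-project-key"  # hypothetical

def pseudonym(identifier: str, length: int = 12) -> str:
    """Deterministic pseudonym derived from a keyed hash of the identifier."""
    digest = hmac.new(SECRET_KEY, identifier.encode("utf-8"),
                      hashlib.sha256).digest()
    return base64.b32encode(digest).decode("ascii")[:length]

# The same subject always receives the same pseudonym, enabling record
# linkage across sites without sharing the identifying details themselves.
print(pseudonym("DOE^JOHN^1970-01-01"))
print(pseudonym("DOE^JOHN^1970-01-01"))  # identical output
```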
", doi="10.2196/49646", url="https://medinform.jmir.org/2024/1/e49646" } @Article{info:doi/10.2196/53241, author="Karimian Sichani, Elnaz and Smith, Aaron and El Emam, Khaled and Mosquera, Lucy", title="Creating High-Quality Synthetic Health Data: Framework for Model Development and Validation", journal="JMIR Form Res", year="2024", month="Apr", day="22", volume="8", pages="e53241", keywords="synthetic data", keywords="tensor decomposition", keywords="data sharing", keywords="data utility", keywords="data privacy", keywords="electronic health record", keywords="longitudinal", keywords="model development", keywords="model validation", keywords="generative models", abstract="Background: Electronic health records are a valuable source of patient information that must be properly deidentified before being shared with researchers. This process requires expertise and time. In addition, synthetic data have considerably reduced the restrictions on the use and sharing of real data, allowing researchers to access it more rapidly with far fewer privacy constraints. Therefore, there has been a growing interest in establishing a method to generate synthetic data that protects patients' privacy while properly reflecting the data. Objective: This study aims to develop and validate a model that generates valuable synthetic longitudinal health data while protecting the privacy of the patients whose data are collected. Methods: We investigated the best model for generating synthetic health data, with a focus on longitudinal observations. We developed a generative model that relies on the generalized canonical polyadic (GCP) tensor decomposition. This model also involves sampling from a latent factor matrix of GCP decomposition, which contains patient factors, using sequential decision trees, copula, and Hamiltonian Monte Carlo methods. We applied the proposed model to samples from the MIMIC-III (version 1.4) data set. Numerous analyses and experiments were conducted with different data structures and scenarios. We assessed the similarity between our synthetic data and the real data by conducting utility assessments. These assessments evaluate the structure and general patterns present in the data, such as dependency structure, descriptive statistics, and marginal distributions. Regarding privacy disclosure, our model preserves privacy by preventing the direct sharing of patient information and eliminating the one-to-one link between the observed and model tensor records. This was achieved by simulating and modeling a latent factor matrix of GCP decomposition associated with patients. Results: The findings show that our model is a promising method for generating synthetic longitudinal health data that is similar enough to real data. It can preserve the utility and privacy of the original data while also handling various data structures and scenarios. In certain experiments, all simulation methods used in the model produced the same high level of performance. Our model is also capable of addressing the challenge of sampling patients from electronic health records. This means that we can simulate a variety of patients in the synthetic data set, which may differ in number from the patients in the original data. Conclusions: We have presented a generative model for producing synthetic longitudinal health data. The model is formulated by applying the GCP tensor decomposition. We have provided 3 approaches for the synthesis and simulation of a latent factor matrix following the process of factorization. 
In brief, we have reduced the challenge of synthesizing massive longitudinal health data to synthesizing a nonlongitudinal and significantly smaller data set. ", doi="10.2196/53241", url="https://formative.jmir.org/2024/1/e53241", url="http://www.ncbi.nlm.nih.gov/pubmed/38648097" } @Article{info:doi/10.2196/47125, author="Wang, Echo H. and Weiner, P. Jonathan and Saria, Suchi and Kharrazi, Hadi", title="Evaluating Algorithmic Bias in 30-Day Hospital Readmission Models: Retrospective Analysis", journal="J Med Internet Res", year="2024", month="Apr", day="18", volume="26", pages="e47125", keywords="algorithmic bias", keywords="model bias", keywords="predictive models", keywords="model fairness", keywords="health disparity", keywords="hospital readmission", keywords="retrospective analysis", abstract="Background: The adoption of predictive algorithms in health care comes with the potential for algorithmic bias, which could exacerbate existing disparities. Fairness metrics have been proposed to measure algorithmic bias, but their application to real-world tasks is limited. Objective: This study aims to evaluate the algorithmic bias associated with the application of common 30-day hospital readmission models and assess the usefulness and interpretability of selected fairness metrics. Methods: We used 10.6 million adult inpatient discharges from Maryland and Florida from 2016 to 2019 in this retrospective study. Models predicting 30-day hospital readmissions were evaluated: LACE Index, modified HOSPITAL score, and modified Centers for Medicare \& Medicaid Services (CMS) readmission measure, which were applied as-is (using existing coefficients) and retrained (recalibrated with 50\% of the data). Predictive performances and bias measures were evaluated overall, between Black and White populations, and between low- and other-income groups. Bias measures included the parity of false negative rate (FNR), false positive rate (FPR), 0-1 loss, and generalized entropy index. Racial bias represented by FNR and FPR differences was stratified to explore shifts in algorithmic bias in different populations. Results: The retrained CMS model demonstrated the best predictive performance (area under the curve: 0.74 in Maryland and 0.68-0.70 in Florida), and the modified HOSPITAL score demonstrated the best calibration (Brier score: 0.16-0.19 in Maryland and 0.19-0.21 in Florida). Calibration was better in White (compared to Black) populations and other-income (compared to low-income) groups, and the area under the curve was higher or similar in the Black (compared to White) populations. The retrained CMS and modified HOSPITAL score had the lowest racial and income bias in Maryland. In Florida, both of these models overall had the lowest income bias and the modified HOSPITAL score showed the lowest racial bias. In both states, the White and higher-income populations showed a higher FNR, while the Black and low-income populations showed a higher FPR and a higher 0-1 loss. When stratified by hospital and population composition, these models demonstrated heterogeneous algorithmic bias in different contexts and populations. Conclusions: Caution must be taken when interpreting fairness measures at face value. A higher FNR or FPR could potentially reflect missed opportunities or wasted resources, but these measures could also reflect health care use patterns and gaps in care. Simply relying on the statistical notions of bias could obscure or underplay the causes of health disparity.
Imperfect health data, analytic frameworks, and the underlying health systems must be carefully considered. Fairness measures can serve as a useful routine assessment to detect disparate model performances but are insufficient to inform mechanisms or policy changes. However, such an assessment is an important first step toward data-driven improvement to address existing health disparities. ", doi="10.2196/47125", url="https://www.jmir.org/2024/1/e47125", url="http://www.ncbi.nlm.nih.gov/pubmed/38422347" } @Article{info:doi/10.2196/38170, author="Qian, Weicheng and Cooke, Aranock and Stanley, Gordon Kevin and Osgood, David Nathaniel", title="Comparing Contact Tracing Through Bluetooth and GPS Surveillance Data: Simulation-Driven Approach", journal="J Med Internet Res", year="2024", month="Apr", day="17", volume="26", pages="e38170", keywords="smartphone-based sensing", keywords="proximity contact data", keywords="transmission models", keywords="agent-based simulation", keywords="health informatics", keywords="mobile phone", abstract="Background: Accurate and responsive epidemiological simulations of epidemic outbreaks inform decision-making to mitigate the impact of pandemics. These simulations must be grounded in quantities derived from measurements, among which the parameters associated with contacts between individuals are notoriously difficult to estimate. Digital contact tracing data, such as those provided by Bluetooth beaconing or GPS colocating, can provide more precise measures of contact than traditional methods based on direct observation or self-reporting. Both measurement modalities have shortcomings and are prone to false positives or negatives, as unmeasured environmental influences bias the data. Objective: We aim to compare GPS colocated versus Bluetooth beacon--derived proximity contact data for their impacts on transmission model results across communities and types of diseases. Methods: We examined the contact patterns derived from 3 data sets collected in 2016, with participants comprising students and staff from the University of Saskatchewan in Canada. Each of these 3 data sets used both Bluetooth beaconing and GPS localization on smartphones running the Ethica Data (Avicenna Research) app to collect sensor data about every 5 minutes over a month. We compared the structure of contact networks inferred from proximity contact data collected with the modalities of GPS colocating and Bluetooth beaconing. We assessed the impact of sensing modalities on the simulation results of transmission models informed by proximate contacts derived from sensing data. Specifically, we compared the incidence number, attack rate, and individual infection risks across simulation results of agent-based susceptible-exposed-infectious-removed transmission models of 4 different contagious diseases. We demonstrated their differences with violin plots, 2-tailed t tests, and Kullback-Leibler divergence. Results: Both network structure analyses show visually salient differences in proximity contact data collected between GPS colocating and Bluetooth beaconing, regardless of the underlying population. Significant differences were found for the estimated attack rate based on distance threshold, measurement modality, and simulated disease. This finding demonstrates that the sensor modality used to trace contact can have a significant impact on the expected propagation of a disease through a population.
The violin plots of attack rate and Kullback-Leibler divergence of individual infection risks demonstrated discernible differences for different sensing modalities, regardless of the underlying population and diseases. The results of the t tests on attack rate between different sensing modalities were mostly significant (P<.001). Conclusions: We show that the contact networks generated from these 2 measurement modalities are different and generate significantly different attack rates across multiple data sets and pathogens. While both modalities offer higher-resolution portraits of contact behavior than is possible with most traditional contact measures, the differential impact of measurement modality on the simulation outcome cannot be ignored and must be addressed in studies only using a single measure of contact in the future. ", doi="10.2196/38170", url="https://www.jmir.org/2024/1/e38170", url="http://www.ncbi.nlm.nih.gov/pubmed/38422493" } @Article{info:doi/10.2196/48330, author="Ke, Yuhe and Yang, Rui and Liu, Nan", title="Comparing Open-Access Database and Traditional Intensive Care Studies Using Machine Learning: Bibliometric Analysis Study", journal="J Med Internet Res", year="2024", month="Apr", day="17", volume="26", pages="e48330", keywords="BERTopic", keywords="critical care", keywords="eICU", keywords="machine learning", keywords="MIMIC", keywords="Medical Information Mart for Intensive Care", keywords="natural language processing", abstract="Background: Intensive care research has predominantly relied on conventional methods like randomized controlled trials. However, the increasing popularity of open-access, free databases in the past decade has opened new avenues for research, offering fresh insights. Leveraging machine learning (ML) techniques enables the analysis of trends in a vast number of studies. Objective: This study aims to conduct a comprehensive bibliometric analysis using ML to compare trends and research topics in traditional intensive care unit (ICU) studies and those done with open-access databases (OADs). Methods: We used ML for the analysis of publications in the Web of Science database in this study. Articles were categorized into ``OAD'' and ``traditional intensive care'' (TIC) studies. OAD studies were included in the Medical Information Mart for Intensive Care (MIMIC), eICU Collaborative Research Database (eICU-CRD), Amsterdam University Medical Centers Database (AmsterdamUMCdb), High Time Resolution ICU Dataset (HiRID), and Pediatric Intensive Care database. TIC studies included all other intensive care studies. Uniform manifold approximation and projection was used to visualize the corpus distribution. The BERTopic technique was used to generate 30 topic-unique identification numbers and to categorize topics into 22 topic families. Results: A total of 227,893 records were extracted. After exclusions, 145,426 articles were identified as TIC and 1301 articles as OAD studies. TIC studies experienced exponential growth over the last 2 decades, culminating in a peak of 16,378 articles in 2021, while OAD studies demonstrated a consistent upsurge since 2018. Sepsis, ventilation-related research, and pediatric intensive care were the most frequently discussed topics. TIC studies exhibited broader coverage than OAD studies, suggesting a more extensive research scope. Conclusions: This study analyzed ICU research, providing valuable insights from a large number of publications. 
OAD studies complement TIC studies, focusing on predictive modeling, while TIC studies capture essential qualitative information. Integrating both approaches in a complementary manner is the future direction for ICU research. Additionally, natural language processing techniques offer a transformative alternative for literature review and bibliometric analysis. ", doi="10.2196/48330", url="https://www.jmir.org/2024/1/e48330", url="http://www.ncbi.nlm.nih.gov/pubmed/38630522" } @Article{info:doi/10.2196/55794, author="Nishioka, Satoshi and Watabe, Satoshi and Yanagisawa, Yuki and Sayama, Kyoko and Kizaki, Hayato and Imai, Shungo and Someya, Mitsuhiro and Taniguchi, Ryoo and Yada, Shuntaro and Aramaki, Eiji and Hori, Satoko", title="Adverse Event Signal Detection Using Patients' Concerns in Pharmaceutical Care Records: Evaluation of Deep Learning Models", journal="J Med Internet Res", year="2024", month="Apr", day="16", volume="26", pages="e55794", keywords="cancer", keywords="anticancer drug", keywords="adverse event", keywords="side effect", keywords="patient-reported outcome", keywords="patients' voice", keywords="patient-oriented", keywords="patient narrative", keywords="natural language processing", keywords="deep learning", keywords="pharmaceutical care record", keywords="SOAP", abstract="Background: Early detection of adverse events and their management are crucial to improving anticancer treatment outcomes, and listening to patients' subjective opinions (patients' voices) can make a major contribution to improving safety management. Recent progress in deep learning technologies has enabled various new approaches for the evaluation of safety-related events based on patient-generated text data, but few studies have focused on the improvement of real-time safety monitoring for individual patients. In addition, no study has yet been performed to validate deep learning models for screening patients' narratives for clinically important adverse event signals that require medical intervention. In our previous work, novel deep learning models were developed to detect adverse event signals for hand-foot syndrome or adverse events limiting patients' daily lives from the authored narratives of patients with cancer, aiming ultimately to use them as safety monitoring support tools for individual patients. Objective: This study was designed to evaluate whether our deep learning models can screen clinically important adverse event signals that require intervention by health care professionals. The applicability of our deep learning models to data on patients' concerns at pharmacies was also assessed. Methods: Pharmaceutical care records at community pharmacies were used for the evaluation of our deep learning models. The records followed the SOAP format, consisting of subjective (S), objective (O), assessment (A), and plan (P) columns. Because of the unique combination of patients' concerns in the S column and the professional records of the pharmacists, these records were considered suitable data for the present purpose. Our deep learning models were applied to the S records of patients with cancer, and the extracted adverse event signals were assessed in relation to medical actions and prescribed drugs. Results: From 30,784 S records of 2479 patients with at least 1 prescription of anticancer drugs, our deep learning models extracted true adverse event signals with more than 80\% accuracy for both hand-foot syndrome (n=152, 91\%) and adverse events limiting patients' daily lives (n=157, 80.1\%).
The deep learning models were also able to screen adverse event signals that require medical intervention by health care providers. The extracted adverse event signals could reflect the side effects of anticancer drugs used by the patients based on analysis of prescribed anticancer drugs. ``Pain or numbness'' (n=57, 36.3\%), ``fever'' (n=46, 29.3\%), and ``nausea'' (n=40, 25.5\%) were common symptoms among the true adverse event signals identified by the model for adverse events limiting patients' daily lives. Conclusions: Our deep learning models were able to screen clinically important adverse event signals that require medical intervention. It was also confirmed that these deep learning models could be applied to patients' subjective information recorded in pharmaceutical care records accumulated during pharmacists' daily work. ", doi="10.2196/55794", url="https://www.jmir.org/2024/1/e55794", url="http://www.ncbi.nlm.nih.gov/pubmed/38625718" } @Article{info:doi/10.2196/55779, author="Tsafnat, Guy and Dunscombe, Rachel and Gabriel, Davera and Grieve, Grahame and Reich, Christian", title="Converge or Collide? Making Sense of a Plethora of Open Data Standards in Health Care", journal="J Med Internet Res", year="2024", month="Apr", day="9", volume="26", pages="e55779", keywords="interoperability", keywords="clinical data", keywords="open data standards", keywords="health care", keywords="digital health", keywords="health care data", doi="10.2196/55779", url="https://www.jmir.org/2024/1/e55779", url="http://www.ncbi.nlm.nih.gov/pubmed/38593431" } @Article{info:doi/10.2196/53400, author="Seo, Hyeram and Ahn, Imjin and Gwon, Hansle and Kang, Heejun and Kim, Yunha and Choi, Heejung and Kim, Minkyoung and Han, Jiye and Kee, Gaeun and Park, Seohyun and Ko, Soyoung and Jung, HyoJe and Kim, Byeolhee and Oh, Jungsik and Jun, Joon Tae and Kim, Young-Hak", title="Forecasting Hospital Room and Ward Occupancy Using Static and Dynamic Information Concurrently: Retrospective Single-Center Cohort Study", journal="JMIR Med Inform", year="2024", month="Mar", day="21", volume="12", pages="e53400", keywords="hospital bed occupancy", keywords="electronic medical records", keywords="time series forecasting", keywords="short-term memory", keywords="combining static and dynamic variables", abstract="Background: Predicting the bed occupancy rate (BOR) is essential for efficient hospital resource management, long-term budget planning, and patient care planning. Although macro-level BOR prediction for the entire hospital is crucial, predicting occupancy at a detailed level, such as specific wards and rooms, is more practical and useful for hospital scheduling. Objective: The aim of this study was to develop a web-based support tool that allows hospital administrators to grasp the BOR for each ward and room according to different time periods. Methods: We trained time-series models based on long short-term memory (LSTM) using individual bed data aggregated hourly each day to predict the BOR for each ward and room in the hospital. Ward training involved 2 models with 7- and 30-day time windows, and room training involved models with 3- and 7-day time windows for shorter-term planning. To further improve prediction performance, we added 2 models trained by concatenating dynamic data with static data representing room-specific details. Results: We confirmed the results of a total of 12 models using bidirectional long short-term memory (Bi-LSTM) and LSTM, and the model based on Bi-LSTM showed better performance.
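A minimal Bi-LSTM forecasting sketch in the spirit of these models is shown below, using a simulated hourly occupancy series; the window length mirrors the 7-day ward setting, but the architecture and data are illustrative assumptions.

```python
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

# Toy hourly occupancy series; real inputs would be hourly aggregated bed
# data, optionally concatenated with static room descriptors as in the study.
T, WINDOW = 500, 24 * 7  # 7-day lookback, mirroring the ward-level window
series = np.random.rand(T).astype("float32")
X = np.stack([series[i:i + WINDOW] for i in range(T - WINDOW)])[..., None]
y = series[WINDOW:]

model = keras.Sequential([
    layers.Input(shape=(WINDOW, 1)),
    layers.Bidirectional(layers.LSTM(32)),  # Bi-LSTM encoder
    layers.Dense(1),                        # next-hour occupancy rate
])
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.fit(X, y, epochs=2, batch_size=32, verbose=0)
```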
The ward-level prediction model had a mean absolute error (MAE) of 0.067, mean square error (MSE) of 0.009, root mean square error (RMSE) of 0.094, and R2 score of 0.544. Among the room-level prediction models, the model that combined static data exhibited superior performance, with an MAE of 0.129, MSE of 0.050, RMSE of 0.227, and R2 score of 0.600. Model results can be displayed on an electronic dashboard for easy access via the web. Conclusions: We have proposed predictive BOR models for individual wards and rooms that demonstrate high performance. The results can be visualized through a web-based dashboard, aiding hospital administrators in bed operation planning. This contributes to resource optimization and the reduction of hospital resource use. ", doi="10.2196/53400", url="https://medinform.jmir.org/2024/1/e53400", url="http://www.ncbi.nlm.nih.gov/pubmed/38513229" } @Article{info:doi/10.2196/47508, author="Guo, Feipeng and Liu, Zixiang and Lu, Qibei and Ji, Shaobo and Zhang, Chen", title="Public Opinion About COVID-19 on a Microblog Platform in China: Topic Modeling and Multidimensional Sentiment Analysis of Social Media", journal="J Med Internet Res", year="2024", month="Jan", day="31", volume="26", pages="e47508", keywords="COVID-19", keywords="social media public opinion", keywords="microblog", keywords="sentiment analysis", keywords="topic modeling", abstract="Background: The COVID-19 pandemic raised wide concern from all walks of life globally. Social media platforms became an important channel for information dissemination and an effective medium for public sentiment transmission during the COVID-19 pandemic. Objective: Mining and analyzing social media text information can not only reflect the changes in public sentiment characteristics during the COVID-19 pandemic but also help the government understand the trends in public opinion and reasonably control public opinion. Methods: First, this study collected microblog comments related to the COVID-19 pandemic as a data set. Second, sentiment analysis was carried out based on the topic modeling method combining latent Dirichlet allocation (LDA) and Bidirectional Encoder Representations from Transformers (BERT). Finally, a machine learning logistic regression (ML-LR) model combined with a sparse matrix was proposed to explore the evolutionary trend in public opinion on social media and verify the high accuracy of the model. Results: The experimental results show that, in different stages, the characteristics of public emotion are different, and the overall trend is from negative to positive. Conclusions: The proposed method can effectively reflect the characteristics of the different times and space of public opinion. The results provide theoretical support and practical reference in response to public health and safety events. ", doi="10.2196/47508", url="https://www.jmir.org/2024/1/e47508", url="http://www.ncbi.nlm.nih.gov/pubmed/38294856" } @Article{info:doi/10.2196/53516, author="Koonce, Y. Taneya and Giuse, A. Dario and Williams, M. Annette and Blasingame, N. Mallory and Krump, A. Poppy and Su, Jing and Giuse, B.
Nunzia", title="Using a Natural Language Processing Approach to Support Rapid Knowledge Acquisition", journal="JMIR Med Inform", year="2024", month="Jan", day="30", volume="12", pages="e53516", keywords="natural language processing", keywords="electronic health records", keywords="machine learning", keywords="data mining", keywords="knowledge management", keywords="NLP", doi="10.2196/53516", url="https://medinform.jmir.org/2024/1/e53516", url="http://www.ncbi.nlm.nih.gov/pubmed/38289670" } @Article{info:doi/10.2196/49007, author="Mehra, Tarun and Wekhof, Tobias and Keller, Iris Dagmar", title="Additional Value From Free-Text Diagnoses in Electronic Health Records: Hybrid Dictionary and Machine Learning Classification Study", journal="JMIR Med Inform", year="2024", month="Jan", day="17", volume="12", pages="e49007", keywords="electronic health records", keywords="free text", keywords="natural language processing", keywords="NLP", keywords="artificial intelligence", keywords="AI", abstract="Background: Physicians are hesitant to forgo the opportunity of entering unstructured clinical notes for structured data entry in electronic health records. Does free text increase informational value in comparison with structured data? Objective: This study aims to compare information from unstructured text-based chief complaints harvested and processed by a natural language processing (NLP) algorithm with clinician-entered structured diagnoses in terms of their potential utility for automated improvement of patient workflows. Methods: Electronic health records of 293,298 patient visits at the emergency department of a Swiss university hospital from January 2014 to October 2021 were analyzed. Using emergency department overcrowding as a case in point, we compared supervised NLP-based keyword dictionaries of symptom clusters from unstructured clinical notes and clinician-entered chief complaints from a structured drop-down menu with the following 2 outcomes: hospitalization and high Emergency Severity Index (ESI) score. Results: Of 12 symptom clusters, the NLP cluster was substantial in predicting hospitalization in 11 (92\%) clusters; 8 (67\%) clusters remained significant even after controlling for the cluster of clinician-determined chief complaints in the model. All 12 NLP symptom clusters were significant in predicting a low ESI score, of which 9 (75\%) remained significant when controlling for clinician-determined chief complaints. The correlation between NLP clusters and chief complaints was low (r=?0.04 to 0.6), indicating complementarity of information. Conclusions: The NLP-derived features and clinicians' knowledge were complementary in explaining patient outcome heterogeneity. They can provide an efficient approach to patient flow management, for example, in an emergency medicine setting. We further demonstrated the feasibility of creating extensive and precise keyword dictionaries with NLP by medical experts without requiring programming knowledge. Using the dictionary, we could classify short and unstructured clinical texts into diagnostic categories defined by the clinician. 
", doi="10.2196/49007", url="https://medinform.jmir.org/2024/1/e49007", url="http://www.ncbi.nlm.nih.gov/pubmed/38231569" } @Article{info:doi/10.2196/42477, author="Bazoge, Adrien and Morin, Emmanuel and Daille, B{\'e}atrice and Gourraud, Pierre-Antoine", title="Applying Natural Language Processing to Textual Data From Clinical Data Warehouses: Systematic Review", journal="JMIR Med Inform", year="2023", month="Dec", day="15", volume="11", pages="e42477", keywords="natural language processing", keywords="data warehousing", keywords="clinical data warehouse", keywords="artificial intelligence", keywords="AI", abstract="Background: In recent years, health data collected during the clinical care process have been often repurposed for secondary use through clinical data warehouses (CDWs), which interconnect disparate data from different sources. A large amount of information of high clinical value is stored in unstructured text format. Natural language processing (NLP), which implements algorithms that can operate on massive unstructured textual data, has the potential to structure the data and make clinical information more accessible. Objective: The aim of this review was to provide an overview of studies applying NLP to textual data from CDWs. It focuses on identifying the (1) NLP tasks applied to data from CDWs and (2) NLP methods used to tackle these tasks. Methods: This review was performed according to the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) guidelines. We searched for relevant articles in 3 bibliographic databases: PubMed, Google Scholar, and ACL Anthology. We reviewed the titles and abstracts and included articles according to the following inclusion criteria: (1) focus on NLP applied to textual data from CDWs, (2) articles published between 1995 and 2021, and (3) written in English. Results: We identified 1353 articles, of which 194 (14.34\%) met the inclusion criteria. Among all identified NLP tasks in the included papers, information extraction from clinical text (112/194, 57.7\%) and the identification of patients (51/194, 26.3\%) were the most frequent tasks. To address the various tasks, symbolic methods were the most common NLP methods (124/232, 53.4\%), showing that some tasks can be partially achieved with classical NLP techniques, such as regular expressions or pattern matching that exploit specialized lexica, such as drug lists and terminologies. Machine learning (70/232, 30.2\%) and deep learning (38/232, 16.4\%) have been increasingly used in recent years, including the most recent approaches based on transformers. NLP methods were mostly applied to English language data (153/194, 78.9\%). Conclusions: CDWs are central to the secondary use of clinical texts for research purposes. Although the use of NLP on data from CDWs is growing, there remain challenges in this field, especially with regard to languages other than English. Clinical NLP is an effective strategy for accessing, extracting, and transforming data from CDWs. Information retrieved with NLP can assist in clinical research and have an impact on clinical practice. 
", doi="10.2196/42477", url="https://medinform.jmir.org/2023/1/e42477", url="http://www.ncbi.nlm.nih.gov/pubmed/38100200" } @Article{info:doi/10.2196/50017, author="Renner, Christopher and Reimer, Niklas and Christoph, Jan and Busch, Hauke and Metzger, Patrick and Boerries, Melanie and Ustjanzew, Arsenij and Boehm, Dominik and Unberath, Philipp", title="Extending cBioPortal for Therapy Recommendation Documentation in Molecular Tumor Boards: Development and Usability Study", journal="JMIR Med Inform", year="2023", month="Dec", day="11", volume="11", pages="e50017", keywords="molecular tumor board", keywords="documentation platform", keywords="usability evaluation", keywords="cBioPortal", keywords="precision medicine", keywords="genomics", keywords="health information interoperability", keywords="tumor", keywords="implementation", keywords="cancer", keywords="tool", keywords="platform", keywords="development", keywords="precision", keywords="use", keywords="user-centered", abstract="Background: In molecular tumor boards (MTBs), patients with rare or advanced cancers are discussed by a multidisciplinary team of health care professionals. Software support for MTBs is lacking; in particular, tools for preparing and documenting MTB therapy recommendations need to be developed. Objective: We aimed to implement an extension to cBioPortal to provide a tool for the documentation of therapy recommendations from MTB sessions in a secure and standardized manner. The developed extension should be embedded in the patient view of cBioPortal to enable easy documentation during MTB sessions. The resulting architecture for storing therapy recommendations should be integrable into various hospital information systems. Methods: On the basis of a requirements analysis and technology analysis for authentication techniques, a prototype was developed and iteratively refined through a user-centered development process. In conclusion, the tool was evaluated via a usability evaluation, including interviews, structured questionnaires, and the System Usability Scale. Results: The patient view of cBioPortal was extended with a new tab that enables users to document MTB sessions and therapy recommendations. The role-based access control was expanded to allow for a finer distinction among the rights to view, edit, and delete data. The usability evaluation showed overall good usability and a System Usability Scale score of 83.57. Conclusions: This study demonstrates how cBioPortal can be extended to not only visualize MTB patient data but also be used as a documentation platform for therapy recommendations. ", doi="10.2196/50017", url="https://medinform.jmir.org/2023/1/e50017", url="http://www.ncbi.nlm.nih.gov/pubmed/38079196" } @Article{info:doi/10.2196/45815, author="Shi, Jin and Bendig, David and Vollmar, Christian Horst and Rasche, Peter", title="Mapping the Bibliometrics Landscape of AI in Medicine: Methodological Study", journal="J Med Internet Res", year="2023", month="Dec", day="8", volume="25", pages="e45815", keywords="artificial intelligence", keywords="AI", keywords="AI in medicine", keywords="medical AI taxonomy", keywords="Python", keywords="latent Dirichlet allocation", keywords="LDA", keywords="topic modeling", keywords="unsupervised machine learning", abstract="Background: Artificial intelligence (AI), conceived in the 1950s, has permeated numerous industries, intensifying in tandem with advancements in computing power. 
Despite the widespread adoption of AI, its integration into medicine trails other sectors. However, medical AI research has experienced substantial growth, attracting considerable attention from researchers and practitioners. Objective: In the absence of an existing framework, this study aims to outline the current landscape of medical AI research and provide insights into its future developments by examining all AI-related studies within PubMed over the past 2 decades. We also propose potential data acquisition and analysis methods, developed using Python (version 3.11) and to be executed in Spyder IDE (version 5.4.3), for future analogous research. Methods: Our dual-pronged approach involved (1) retrieving publication metadata related to AI from PubMed (spanning 2000-2022) via Python, including titles, abstracts, authors, journals, country, and publishing years, followed by keyword frequency analysis and (2) classifying relevant topics using latent Dirichlet allocation, an unsupervised machine learning approach, and defining the research scope of AI in medicine. In the absence of a universal medical AI taxonomy, we used an AI dictionary based on the European Commission Joint Research Centre AI Watch report, which emphasizes 8 domains: reasoning, planning, learning, perception, communication, integration and interaction, service, and AI ethics and philosophy. Results: From 2000 to 2022, a comprehensive analysis of 307,701 AI-related publications from PubMed highlighted a 36-fold increase. The United States emerged as a clear frontrunner, producing 68,502 of these articles. Despite its substantial contribution in terms of volume, China lagged in terms of citation impact. Among the specific AI domains categorized by the Joint Research Centre AI Watch report, the learning domain emerged as dominant. Our classification analysis meticulously traced the nuanced research trajectories across each domain, revealing the multifaceted and evolving nature of AI's application in the realm of medicine. Conclusions: The research topics have evolved as the volume of AI studies increases annually. Machine learning remains central to medical AI research, with deep learning expected to maintain its fundamental role. Empowered by predictive algorithms, pattern recognition, and imaging analysis capabilities, the future of AI research in medicine is anticipated to concentrate on medical diagnosis, robotic intervention, and disease management. Our topic modeling outcomes provide a clear insight into the focus of AI research in medicine over the past decades and lay the groundwork for predicting future directions. The domains that have attracted considerable research attention, primarily the learning domain, will continue to shape the trajectory of AI in medicine. Given the observed growing interest, the domain of AI ethics and philosophy also stands out as a prospective area of increased focus.
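For orientation, the latent Dirichlet allocation step can be sketched as follows with scikit-learn; the 4 invented titles and 2 topics stand in for the 307,701 PubMed records and the paper's actual settings.

```python
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Invented titles standing in for PubMed metadata; topic and vocabulary
# sizes here are illustrative, not the paper's configuration.
docs = [
    "deep learning for medical image segmentation and diagnosis",
    "machine learning prediction of sepsis onset in intensive care",
    "natural language processing of clinical notes for phenotyping",
    "robotic surgery planning with reinforcement learning",
]
vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(docs)
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(X)

terms = vectorizer.get_feature_names_out()
for k, weights in enumerate(lda.components_):
    top = [terms[i] for i in weights.argsort()[-4:]]
    print(f"topic {k}: {top}")  # most heavily weighted words per topic
```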
", doi="10.2196/45815", url="https://www.jmir.org/2023/1/e45815", url="http://www.ncbi.nlm.nih.gov/pubmed/38064255" } @Article{info:doi/10.2196/53058, author="Lee, Ra Ah and Park, Hojoon and Yoo, Aram and Kim, Seok and Sunwoo, Leonard and Yoo, Sooyoung", title="Risk Prediction of Emergency Department Visits in Patients With Lung Cancer Using Machine Learning: Retrospective Observational Study", journal="JMIR Med Inform", year="2023", month="Dec", day="6", volume="11", pages="e53058", keywords="emergency department", keywords="lung cancer", keywords="risk prediction", keywords="machine learning", keywords="common data model", keywords="emergency", keywords="hospitalization", keywords="hospitalizations", keywords="lung", keywords="cancer", keywords="oncology", keywords="lungs", keywords="pulmonary", keywords="respiratory", keywords="predict", keywords="prediction", keywords="predictions", keywords="predictive", keywords="algorithm", keywords="algorithms", keywords="risk", keywords="risks", keywords="model", keywords="models", abstract="Background: Patients with lung cancer are among the most frequent visitors to emergency departments due to cancer-related problems, and the prognosis for those who seek emergency care is dismal. Given that patients with lung cancer frequently visit health care facilities for treatment or follow-up, the ability to predict emergency department visits based on clinical information gleaned from their routine visits would enhance hospital resource utilization and patient outcomes. Objective: This study proposed a machine learning--based prediction model to identify risk factors for emergency department visits by patients with lung cancer. Methods: This was a retrospective observational study of patients with lung cancer diagnosed at Seoul National University Bundang Hospital, a tertiary general hospital in South Korea, between January 2010 and December 2017. The primary outcome was an emergency department visit within 30 days of an outpatient visit. This study developed a machine learning--based prediction model using a common data model. In addition, the importance of features that influenced the decision-making of the model output was analyzed to identify significant clinical factors. Results: The model with the best performance demonstrated an area under the receiver operating characteristic curve of 0.73 in its ability to predict the attendance of patients with lung cancer in emergency departments. The frequency of recent visits to the emergency department and several laboratory test results that are typically collected during cancer treatment follow-up visits were revealed as influencing factors for the model output. Conclusions: This study developed a machine learning--based risk prediction model using a common data model and identified influencing factors for emergency department visits by patients with lung cancer. The predictive model contributes to the efficiency of resource utilization and health care service quality by facilitating the identification and early intervention of high-risk patients. This study demonstrated the possibility of collaborative research among different institutions using the common data model for precision medicine in lung cancer. 
", doi="10.2196/53058", url="https://medinform.jmir.org/2023/1/e53058", url="http://www.ncbi.nlm.nih.gov/pubmed/38055320" } @Article{info:doi/10.2196/47859, author="Kang, Jin Ha Ye and Batbaatar, Erdenebileg and Choi, Dong-Woo and Choi, Son Kui and Ko, Minsam and Ryu, Sun Kwang", title="Synthetic Tabular Data Based on Generative Adversarial Networks in Health Care: Generation and Validation Using the Divide-and-Conquer Strategy", journal="JMIR Med Inform", year="2023", month="Nov", day="24", volume="11", pages="e47859", keywords="generative adversarial networks", keywords="GAN", keywords="synthetic data generation", keywords="synthetic tabular data", keywords="lung cancer", keywords="machine learning", keywords="mortality prediction", abstract="Background: Synthetic data generation (SDG) based on generative adversarial networks (GANs) is used in health care, but research on preserving data with logical relationships with synthetic tabular data (STD) remains challenging. Filtering methods for SDG can lead to the loss of important information. Objective: This study proposed a divide-and-conquer (DC) method to generate STD based on the GAN algorithm, while preserving data with logical relationships. Methods: The proposed method was evaluated on data from the Korea Association for Lung Cancer Registry (KALC-R) and 2 benchmark data sets (breast cancer and diabetes). The DC-based SDG strategy comprises 3 steps: (1) We used 2 different partitioning methods (the class-specific criterion distinguished between survival and death groups, while the Cramer V criterion identified the highest correlation between columns in the original data); (2) the entire data set was divided into a number of subsets, which were then used as input for the conditional tabular generative adversarial network and the copula generative adversarial network to generate synthetic data; and (3) the generated synthetic data were consolidated into a single entity. For validation, we compared DC-based SDG and conditional sampling (CS)--based SDG through the performances of machine learning models. In addition, we generated imbalanced and balanced synthetic data for each of the 3 data sets and compared their performance using 4 classifiers: decision tree (DT), random forest (RF), Extreme Gradient Boosting (XGBoost), and light gradient-boosting machine (LGBM) models. Results: The synthetic data of the 3 diseases (non--small cell lung cancer [NSCLC], breast cancer, and diabetes) generated by our proposed model outperformed the 4 classifiers (DT, RF, XGBoost, and LGBM). The CS- versus DC-based model performances were compared using the mean area under the curve (SD) values: 74.87 (SD 0.77) versus 63.87 (SD 2.02) for NSCLC, 73.31 (SD 1.11) versus 67.96 (SD 2.15) for breast cancer, and 61.57 (SD 0.09) versus 60.08 (SD 0.17) for diabetes (DT); 85.61 (SD 0.29) versus 79.01 (SD 1.20) for NSCLC, 78.05 (SD 1.59) versus 73.48 (SD 4.73) for breast cancer, and 59.98 (SD 0.24) versus 58.55 (SD 0.17) for diabetes (RF); 85.20 (SD 0.82) versus 76.42 (SD 0.93) for NSCLC, 77.86 (SD 2.27) versus 68.32 (SD 2.37) for breast cancer, and 60.18 (SD 0.20) versus 58.98 (SD 0.29) for diabetes (XGBoost); and 85.14 (SD 0.77) versus 77.62 (SD 1.85) for NSCLC, 78.16 (SD 1.52) versus 70.02 (SD 2.17) for breast cancer, and 61.75 (SD 0.13) versus 61.12 (SD 0.23) for diabetes (LGBM). In addition, we found that balanced synthetic data performed better. 
Conclusions: This study is the first attempt to generate and validate STD based on a DC approach and shows improved performance using STD. The necessity for balanced SDG was also demonstrated. ", doi="10.2196/47859", url="https://medinform.jmir.org/2023/1/e47859", url="http://www.ncbi.nlm.nih.gov/pubmed/37999942" } @Article{info:doi/10.2196/48030, author="Pirmani, Ashkan and De Brouwer, Edward and Geys, Lotte and Parciak, Tina and Moreau, Yves and Peeters, M. Liesbet", title="The Journey of Data Within a Global Data Sharing Initiative: A Federated 3-Layer Data Analysis Pipeline to Scale Up Multiple Sclerosis Research", journal="JMIR Med Inform", year="2023", month="Nov", day="9", volume="11", pages="e48030", keywords="data analysis pipeline", keywords="federated model sharing", keywords="real-world data", keywords="evidence-based decision-making", keywords="end-to-end pipeline", keywords="multiple sclerosis", keywords="data analysis", keywords="pipeline", keywords="data science", keywords="federated", keywords="neurology", keywords="brain", keywords="spine", keywords="spinal nervous system", keywords="neuroscience", keywords="data sharing", keywords="rare", keywords="low prevalence", abstract="Background: Investigating low-prevalence diseases such as multiple sclerosis is challenging because of the rather small number of individuals affected by this disease and the scattering of real-world data across numerous data sources. These obstacles impair data integration, standardization, and analysis, which negatively impact the generation of significant meaningful clinical evidence. Objective: This study aims to present a comprehensive, research question--agnostic, multistakeholder-driven end-to-end data analysis pipeline that accommodates 3 prevalent data-sharing streams: individual data sharing, core data set sharing, and federated model sharing. Methods: A demand-driven methodology is employed for standardization, followed by 3 streams of data acquisition, a data quality enhancement process, a data integration procedure, and a concluding analysis stage to fulfill real-world data-sharing requirements. This pipeline's effectiveness was demonstrated through its successful implementation in the COVID-19 and multiple sclerosis global data sharing initiative. Results: The global data sharing initiative yielded multiple scientific publications and provided extensive worldwide guidance for the community with multiple sclerosis. The pipeline facilitated gathering pertinent data from various sources, accommodating distinct sharing streams and assimilating them into a unified data set for subsequent statistical analysis or secure data examination. This pipeline contributed to the assembly of the largest data set of people with multiple sclerosis infected with COVID-19. Conclusions: The proposed data analysis pipeline exemplifies the potential of global stakeholder collaboration and underlines the significance of evidence-based decision-making. It serves as a paradigm for how data sharing initiatives can propel advancements in health care, emphasizing its adaptability and capacity to address diverse research inquiries. 
", doi="10.2196/48030", url="https://medinform.jmir.org/2023/1/e48030", url="http://www.ncbi.nlm.nih.gov/pubmed/37943585" } @Article{info:doi/10.2196/49400, author="Luo, Tingyan and Zhou, Jie and Yang, Jing and Xie, Yulan and Wei, Yiru and Mai, Huanzhuo and Lu, Dongjia and Yang, Yuecong and Cui, Ping and Ye, Li and Liang, Hao and Huang, Jiegang", title="Early Warning and Prediction of Scarlet Fever in China Using the Baidu Search Index and Autoregressive Integrated Moving Average With Explanatory Variable (ARIMAX) Model: Time Series Analysis", journal="J Med Internet Res", year="2023", month="Oct", day="30", volume="25", pages="e49400", keywords="scarlet fever", keywords="Baidu search index", keywords="autoregressive integrated moving average", keywords="ARIMA", keywords="warning", keywords="prediction", abstract="Background: Internet-derived data and the autoregressive integrated moving average (ARIMA) and ARIMA with explanatory variable (ARIMAX) models are extensively used for infectious disease surveillance. However, the effectiveness of the Baidu search index (BSI) in predicting the incidence of scarlet fever remains uncertain. Objective: Our objective was to investigate whether a low-cost BSI monitoring system could potentially function as a valuable complement to traditional scarlet fever surveillance in China. Methods: ARIMA and ARIMAX models were developed to predict the incidence of scarlet fever in China using data from the National Health Commission of the People's Republic of China between January 2011 and August 2022. The procedures included establishing a keyword database, keyword selection and filtering through Spearman rank correlation and cross-correlation analyses, construction of the scarlet fever comprehensive search index (CSI), modeling with the training sets, predicting with the testing sets, and comparing the prediction performances. Results: The average monthly incidence of scarlet fever was 4462.17 (SD 3011.75) cases, and annual incidence exhibited an upward trend until 2019. The keyword database contained 52 keywords, but only 6 highly relevant ones were selected for modeling. A high Spearman rank correlation was observed between the scarlet fever reported cases and the scarlet fever CSI (rs=0.881). We developed the ARIMA(4,0,0)(0,1,2)(12) model, and the ARIMA(4,0,0)(0,1,2)(12) + CSI (Lag=0) and ARIMAX(1,0,2)(2,0,0)(12) models were combined with the BSI. The 3 models had a good fit and passed the residuals Ljung-Box test. The ARIMA(4,0,0)(0,1,2)(12), ARIMA(4,0,0)(0,1,2)(12) + CSI (Lag=0), and ARIMAX(1,0,2)(2,0,0)(12) models demonstrated favorable predictive capabilities, with mean absolute errors of 1692.16 (95\% CI 584.88-2799.44), 1067.89 (95\% CI 402.02-1733.76), and 639.75 (95\% CI 188.12-1091.38), respectively; root mean squared errors of 2036.92 (95\% CI 929.64-3144.20), 1224.92 (95\% CI 559.04-1890.79), and 830.80 (95\% CI 379.17-1282.43), respectively; and mean absolute percentage errors of 4.33\% (95\% CI 0.54\%-8.13\%), 3.36\% (95\% CI --0.24\% to 6.96\%), and 2.16\% (95\% CI --0.69\% to 5.00\%), respectively. The ARIMAX models outperformed the ARIMA models and had better prediction performances with smaller values. Conclusions: This study demonstrated that the BSI can be used for the early warning and prediction of scarlet fever, serving as a valuable supplement to traditional surveillance systems. 
", doi="10.2196/49400", url="https://www.jmir.org/2023/1/e49400", url="http://www.ncbi.nlm.nih.gov/pubmed/37902815" } @Article{info:doi/10.2196/46992, author="Pyper, Evelyn and McKeown, Sarah and Hartmann-Boyce, Jamie and Powell, John", title="Digital Health Technology for Real-World Clinical Outcome Measurement Using Patient-Generated Data: Systematic Scoping Review", journal="J Med Internet Res", year="2023", month="Oct", day="11", volume="25", pages="e46992", keywords="real-world evidence", keywords="real-world data", keywords="digital tools", keywords="digital health", keywords="digital biomarkers", keywords="patient-generated health data", keywords="mobile health", keywords="mHealth", keywords="wearables", keywords="digital health management", keywords="clinical intervention", keywords="electronic health record", keywords="health outcomes", keywords="mobile phone", abstract="Background: Digital health technologies (DHTs) play an ever-expanding role in health care management and delivery. Beyond their use as interventions, DHTs also serve as a vehicle for real-world data collection to characterize patients, their care journeys, and their responses to other clinical interventions. There is a need to comprehensively map the evidence---across all conditions and technology types---on DHT measurement of patient outcomes in the real world. Objective: We aimed to investigate the use of DHTs to measure real-world clinical outcomes using patient-generated data. Methods: We conducted this systematic scoping review in accordance with the Joanna Briggs Institute methodology. Detailed eligibility criteria documented in a preregistered protocol informed a search strategy for the following databases: MEDLINE (Ovid), CINAHL, Cochrane (CENTRAL), Embase, PsycINFO, ClinicalTrials.gov, and the EU Clinical Trials Register. We considered studies published between 2000 and 2022 wherein digital health data were collected, passively or actively, from patients with any specified health condition outside of clinical visits. Categories for key concepts, such as DHT type and analytical applications, were established where needed. Following screening and full-text review, data were extracted and analyzed using predefined fields, and findings were reported in accordance with established guidelines. Results: The search strategy identified 11,015 publications, with 7308 records after duplicates and reviews were removed. After screening and full-text review, 510 studies were included for extraction. These studies encompassed 169 different conditions in over 20 therapeutic areas and 44 countries. The DHTs used for mental health and addictions research (111/510, 21.8\%) were the most prevalent. The most common type of DHT, mobile apps, was observed in approximately half of the studies (250/510, 49\%). Most studies used only 1 DHT (346/510, 67.8\%); however, the majority of technologies used were able to collect more than 1 type of data, with the most common being physiological data (189/510, 37.1\%), clinical symptoms data (188/510, 36.9\%), and behavioral data (171/510, 33.5\%). Overall, there has been real growth in the depth and breadth of evidence, number of DHT types, and use of artificial intelligence and advanced analytics over time. Conclusions: This scoping review offers a comprehensive view of the variety of types of technology, data, collection methods, analytical approaches, and therapeutic applications within this growing body of evidence. 
To unlock the full potential of DHT for measuring health outcomes and capturing digital biomarkers, there is a need for more rigorous research that goes beyond technology validation to demonstrate whether robust real-world data can be reliably captured from patients in their daily life and whether its capture improves patient outcomes. This study provides a valuable repository of DHT studies to inform subsequent research by health care providers, policy makers, and the life sciences industry. Trial Registration: Open Science Framework 5TMKY; https://osf.io/5tmky/ ", doi="10.2196/46992", url="https://www.jmir.org/2023/1/e46992", url="http://www.ncbi.nlm.nih.gov/pubmed/37819698" } @Article{info:doi/10.2196/44310, author="Guo, Manping and Wang, Yiming and Yang, Qiaoning and Li, Rui and Zhao, Yang and Li, Chenfei and Zhu, Mingbo and Cui, Yao and Jiang, Xin and Sheng, Song and Li, Qingna and Gao, Rui", title="Normal Workflow and Key Strategies for Data Cleaning Toward Real-World Data: Viewpoint", journal="Interact J Med Res", year="2023", month="Sep", day="21", volume="12", pages="e44310", keywords="data cleaning", keywords="data quality", keywords="key technologies", keywords="real-world data", keywords="viewpoint", doi="10.2196/44310", url="https://www.i-jmr.org/2023/1/e44310", url="http://www.ncbi.nlm.nih.gov/pubmed/37733421" } @Article{info:doi/10.2196/45846, author="Zhang, Wang and Zhu, Zhu and Zhao, Yonggen and Li, Zheming and Chen, Lingdong and Huang, Jian and Li, Jing and Yu, Gang", title="Analyzing and Forecasting Pediatric Fever Clinic Visits in High Frequency Using Ensemble Time-Series Methods After the COVID-19 Pandemic in Hangzhou, China: Retrospective Study", journal="JMIR Med Inform", year="2023", month="Sep", day="20", volume="11", pages="e45846", keywords="time-series forecasting", keywords="outpatient visits", keywords="hospital management", keywords="pediatric fever clinic", keywords="long sequence", keywords="visits in high frequency", keywords="COVID-19", abstract="Background: The COVID-19 pandemic has significantly altered the global health and medical landscape. In response to the outbreak, Chinese hospitals have established 24-hour fever clinics to serve patients with COVID-19. The emergence of these clinics and the impact of successive epidemics have led to a surge in visits, placing pressure on hospital resource allocation and scheduling. Therefore, accurate prediction of outpatient visits is essential for informed decision-making in hospital management. Objective: Hourly visits to fever clinics can be characterized as a long-sequence time series in high frequency, which also exhibits distinct patterns due to the particularity of pediatric treatment behavior in an epidemic context. This study aimed to build models to forecast fever clinic visits with outstanding prediction accuracy and robust generalization across forecast horizons. In addition, this study hopes to provide a research paradigm for time-series forecasting problems, which involves an exploratory analysis revealing data patterns before model development. Methods: An exploratory analysis, including graphical analysis, autocorrelation analysis, and seasonal-trend decomposition, was conducted to reveal the seasonality and structural patterns of the retrospective fever clinic visit data. The data were found to exhibit multiseasonality and nonlinearity. On the basis of these results, an ensemble of time-series analysis methods, including individual models and their combinations, was validated on the data set.
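The seasonal-trend decomposition step mentioned in this exploratory analysis can be sketched as follows; the simulated hourly series with a daily cycle is an invented stand-in for the clinic data.

```python
import numpy as np
import pandas as pd
from statsmodels.tsa.seasonal import STL

# Simulated hourly visit counts with a daily cycle, standing in for the
# retrospective fever clinic series analyzed above.
idx = pd.date_range("2023-01-01", periods=24 * 60, freq="h")
rng = np.random.default_rng(1)
visits = pd.Series(10 + 5 * np.sin(2 * np.pi * idx.hour / 24)
                   + rng.normal(0, 1, len(idx)), index=idx)

res = STL(visits, period=24).fit()  # trend, seasonal, remainder components
print(res.seasonal.head())
```

Inspecting the decomposed components is one way to surface the multiseasonality that motivated the STLF component of the hybrid model.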
Root mean square error and mean absolute error were used as accuracy metrics, with the cross-validation of rolling forecasting origin conducted across different forecast horizons. Results: Hybrid models generally outperformed individual models across most forecast horizons. A novel model combination, the hybrid neural network autoregressive (NNAR)-seasonal and trend decomposition using Loess forecasting (STLF), was identified as the optimal model for our forecasting task, with the best performance in all accuracy metrics (root mean square error=20.1, mean absolute error=14.3) for the 15-days-ahead forecasts and an overall advantage for forecast horizons that were 1 to 30 days ahead. Conclusions: Although forecast accuracy tends to decline with an increasing forecast horizon, the hybrid NNAR-STLF model is applicable for short-, medium-, and long-term forecasts owing to its ability to fit multiseasonality (captured by the STLF component) and nonlinearity (captured by the NNAR component). The model identified in this study is also applicable to hospitals in other regions with similar epidemic outpatient configurations or forecasting tasks whose data conform to long-sequence time series in high frequency exhibiting multiseasonal and nonlinear patterns. However, as external variables and disruptive events were not accounted for, the model performance declined slightly following changes in the COVID-19 containment policy in China. Future work may seek to improve accuracy by incorporating external variables that characterize moving events or other factors as well as by adding data from different organizations to enhance algorithm generalization. ", doi="10.2196/45846", url="https://medinform.jmir.org/2023/1/e45846", url="http://www.ncbi.nlm.nih.gov/pubmed/37728972" } @Article{info:doi/10.2196/49593, author="Shau, Wen-Yi and Setia, Sajita and Chen, Ying-Jan and Ho, Tsu-yun and Prakash Shinde, Salil and Santoso, Handoko and Furtner, Daniel", title="Integrated Real-World Study Databases in 3 Diverse Asian Health Care Systems in Taiwan, India, and Thailand: Scoping Review", journal="J Med Internet Res", year="2023", month="Sep", day="11", volume="25", pages="e49593", keywords="Asia", keywords="health care databases", keywords="real-world data", keywords="real-world evidence", keywords="scoping review", abstract="Background: The use of real-world data (RWD) warehouses for research in Asia is on the rise, but current trends remain largely unexplored. Given the varied economic and health care landscapes in different Asian countries, understanding these trends can offer valuable insights. Objective: We sought to discern the contemporary landscape of linked RWD warehouses and explore their trends and patterns in 3 Asian countries with contrasting economies and health care systems: Taiwan, India, and Thailand. Methods: Using a systematic scoping review methodology, we conducted an exhaustive literature search on PubMed with filters for the English language and the past 5 years. The search combined Medical Subject Heading terms and specific keywords. Studies were screened against strict eligibility criteria to identify eligible studies using RWD databases from more than one health care facility in at least 1 of the 3 target countries. Results: Our search yielded 2277 studies, of which 833 (36.6\%) met our criteria. Overall, single-country studies (SCS) dominated at 89.4\% (n=745), with cross-country collaboration studies (CCCS) being at 10.6\% (n=88). 
However, the country-wise breakdown showed that of all the SCS, 623 (83.6\%) were from Taiwan, 81 (10.9\%) from India, and 41 (5.5\%) from Thailand. Among the total studies conducted in each country, India at 39.1\% (n=133) and Thailand at 43.1\% (n=72) had a significantly higher percentage of CCCS compared to Taiwan at 7.6\% (n=51). Over a 5-year span from 2017 to 2022, India and Thailand experienced an annual increase in RWD studies by approximately 18.2\% and 13.8\%, respectively, while Taiwan's contributions remained consistent. Comparative effectiveness research (CER) was predominant in Taiwan (n=410, or 65.8\% of SCS) but less common in India (n=12, or 14.8\% of SCS) and Thailand (n=11, or 26.8\% of SCS). CER percentages in CCCS were similar across the 3 countries, ranging from 19.2\% (n=10) to 29\% (n=9). The type of RWD source also varied significantly across countries, with India demonstrating a high reliance on electronic medical records or electronic health records at 55.6\% (n=45) of SCS and Taiwan showing an increasing trend in their use over the period. Registries were used in 26 (83.9\%) CCCS and 31 (75.6\%) SCS from Thailand but in <50\% of SCS from Taiwan and India. Health insurance/administrative claims data were used in most of the SCS from Taiwan (n=458, 73.5\%). There was a consistent predominant focus on cardiology/metabolic disorders in all studies, with a noticeable increase in oncology and infectious disease research from 2017 to 2022. Conclusions: This review provides a comprehensive understanding of the evolving landscape of RWD research in Taiwan, India, and Thailand. The observed differences and trends emphasize the unique economic, clinical, and research settings in each country, advocating for tailored strategies for leveraging RWD for future health care research and decision-making. International Registered Report Identifier (IRRID): RR2-10.2196/43741 ", doi="10.2196/49593", url="https://www.jmir.org/2023/1/e49593", url="http://www.ncbi.nlm.nih.gov/pubmed/37615085" } @Article{info:doi/10.2196/48763, author="Klement, William and El Emam, Khaled", title="Consolidated Reporting Guidelines for Prognostic and Diagnostic Machine Learning Modeling Studies: Development and Validation", journal="J Med Internet Res", year="2023", month="Aug", day="31", volume="25", pages="e48763", keywords="machine learning", keywords="prognostic models", keywords="prediction models", keywords="reporting guidelines", keywords="reproducibility guidelines", keywords="diagnostic", keywords="prognostic", keywords="model evaluation", keywords="model training", abstract="Background: The reporting of machine learning (ML) prognostic and diagnostic modeling studies is often inadequate, making it difficult to understand and replicate such studies. To address this issue, multiple consensus and expert reporting guidelines for ML studies have been published. However, these guidelines cover different parts of the analytics lifecycle, and individually, none of them provide a complete set of reporting requirements. Objective: We aimed to consolidate the ML reporting guidelines and checklists in the literature to provide reporting items for prognostic and diagnostic ML in in-silico and shadow mode studies. Methods: We conducted a literature search that identified 192 unique peer-reviewed English articles that provide guidance and checklists for reporting ML studies. The articles were screened by their title and abstract against a set of 9 inclusion and exclusion criteria. 
Articles that passed screening had their quality evaluated by 2 raters using a 9-point checklist constructed from guideline development good practices. The average $\kappa$ was 0.71 across all quality criteria. The 17 articles with a quality score equal to or higher than the median were retained as high-quality source papers. The reporting items in these 17 articles were consolidated and screened against a set of 6 inclusion and exclusion criteria. The resulting reporting items were sent to an external group of 11 ML experts for review and updated accordingly. The updated checklist was used to assess the reporting in 6 recent modeling papers in JMIR AI. Feedback from the external review and initial validation efforts was used to improve the reporting items. Results: In total, 37 reporting items were identified and grouped into 5 categories based on the stage of the ML project: defining the study details, defining and collecting the data, modeling methodology, model evaluation, and explainability. None of the 17 source articles covered all the reporting items. The study details and data description reporting items were the most common in the source literature, with explainability and methodology guidance (ie, data preparation and model training) having the least coverage. For instance, a median of 75\% of the data description reporting items appeared in each of the 17 high-quality source guidelines, but only a median of 33\% of the data explainability reporting items appeared. The highest-quality source articles tended to have more items on reporting study details. Other categories of reporting items were not related to the source article quality. We converted the reporting items into a checklist to support more complete reporting. Conclusions: Our findings supported the need for a set of consolidated reporting items, given that existing high-quality guidelines and checklists do not individually provide complete coverage. The consolidated set of reporting items is expected to improve the quality and reproducibility of ML modeling studies. ", doi="10.2196/48763", url="https://www.jmir.org/2023/1/e48763", url="http://www.ncbi.nlm.nih.gov/pubmed/37651179" } @Article{info:doi/10.2196/45013, author="Inau, Thea Esther and Sack, Jean and Waltemath, Dagmar and Zeleke, Alamirrew Atinkut", title="Initiatives, Concepts, and Implementation Practices of the Findable, Accessible, Interoperable, and Reusable Data Principles in Health Data Stewardship: Scoping Review", journal="J Med Internet Res", year="2023", month="Aug", day="28", volume="25", pages="e45013", keywords="data stewardship", keywords="findable, accessible, interoperable, and reusable data principles", keywords="FAIR data principles", keywords="health research", keywords="Preferred Reporting Items for Systematic Reviews and Meta-Analyses", keywords="PRISMA", keywords="qualitative analysis", keywords="scoping review", keywords="information retrieval", keywords="health information exchange", abstract="Background: Thorough data stewardship is a key enabler of comprehensive health research. Processes such as data collection, storage, access, sharing, and analytics require researchers to follow elaborate data management strategies properly and consistently. Studies have shown that findable, accessible, interoperable, and reusable (FAIR) data leads to improved data sharing in different scientific domains.
Objective: This scoping review identifies and discusses concepts, approaches, implementation experiences, and lessons learned in FAIR initiatives in health research data. Methods: The Arksey and O'Malley stage-based methodological framework for scoping reviews was applied. PubMed, Web of Science, and Google Scholar were searched to access relevant publications. Articles written in English, published between 2014 and 2020, and addressing FAIR concepts or practices in the health domain were included. The 3 data sources were deduplicated using a reference management software. In total, 2 independent authors reviewed the eligibility of each article based on defined inclusion and exclusion criteria. A charting tool was used to extract information from the full-text papers. The results were reported using the PRISMA-ScR (Preferred Reporting Items for Systematic Reviews and Meta-Analyses extension for Scoping Reviews) guidelines. Results: A total of 2.18\% (34/1561) of the screened articles were included in the final review. The authors reported FAIRification approaches, which include interpolation, inclusion of comprehensive data dictionaries, repository design, semantic interoperability, ontologies, data quality, linked data, and requirement gathering for FAIRification tools. Challenges and mitigation strategies associated with FAIRification, such as high setup costs, data politics, technical and administrative issues, privacy concerns, and difficulties encountered in sharing health data despite its sensitive nature were also reported. We found various workflows, tools, and infrastructures designed by different groups worldwide to facilitate the FAIRification of health research data. We also uncovered a wide range of problems and questions that researchers are trying to address by using the different workflows, tools, and infrastructures. Although the concept of FAIR data stewardship in the health research domain is relatively new, almost all continents have been reached by at least one network trying to achieve health data FAIRness. Documented outcomes of FAIRification efforts include peer-reviewed publications, improved data sharing, facilitated data reuse, return on investment, and new treatments. Successful FAIRification of data has informed the management and prognosis of various diseases such as cancer, cardiovascular diseases, and neurological diseases. Efforts to FAIRify data on a wider variety of diseases have been ongoing since the COVID-19 pandemic. Conclusions: This work summarises projects, tools, and workflows for the FAIRification of health research data. The comprehensive review shows that implementing the FAIR concept in health data stewardship carries the promise of improved research data management and transparency in the era of big data and open research publishing. 
International Registered Report Identifier (IRRID): RR2-10.2196/22505 ", doi="10.2196/45013", url="https://www.jmir.org/2023/1/e45013", url="http://www.ncbi.nlm.nih.gov/pubmed/37639292" } @Article{info:doi/10.2196/44842, author="Nan, Jingwen and Xu, Li-Qun", title="Designing Interoperable Health Care Services Based on Fast Healthcare Interoperability Resources: Literature Review", journal="JMIR Med Inform", year="2023", month="Aug", day="21", volume="11", pages="e44842", keywords="Health level 7 Fast Healthcare Interoperability Resources", keywords="HL7 FHIR", keywords="interoperability", keywords="literature review", keywords="practice guideline", keywords="mobile phone", abstract="Background: With the advent of the digital economy and the aging population, the demand for diversified health care services and innovative care delivery models has been overwhelming. This trend has accelerated the urgency to implement effective and efficient data exchange and service interoperability, which underpins coordinated care services among tiered health care institutions, improves the quality of oversight of regulators, and provides vast and comprehensive data collection to support clinical medicine and health economics research, thus improving the overall service quality and patient satisfaction. To meet this demand and facilitate the interoperability of IT systems of stakeholders, after years of preparation, Health Level 7 formally introduced, in 2014, the Fast Healthcare Interoperability Resources (FHIR) standard. It has since continued to evolve. FHIR depends on the Implementation Guide (IG) to ensure feasibility and consistency while developing an interoperable health care service. The IG defines rules with associated documentation on how FHIR resources are used to tackle a particular problem. However, a gap remains between IGs and the process of building actual services because IGs are rules without specifying concrete methods, procedures, or tools. Thus, stakeholders may feel it nontrivial to participate in the ecosystem, giving rise to the need for a more actionable practice guideline (PG) for promoting FHIR's fast adoption. Objective: This study aimed to propose a general FHIR PG to facilitate stakeholders in the health care ecosystem to understand FHIR and quickly develop interoperable health care services. Methods: We selected a collection of FHIR-related papers about the latest studies or use cases on designing and building FHIR-based interoperable health care services and tagged each use case as belonging to 1 of the 3 dominant innovation feature groups that are also associated with practice stages, that is, data standardization, data management, and data integration. Next, we reviewed each group's detailed process and key techniques to build respective care services and collate a complete FHIR PG. Finally, as an example, we arbitrarily selected a use case outside the scope of the reviewed papers and mapped it back to the FHIR PG to demonstrate the effectiveness and generalizability of the PG. Results: The FHIR PG includes 2 core elements: one is a practice design that defines the responsibilities of stakeholders and outlines the complete procedure from data to services, and the other is a development architecture for practice design, which lists the available tools for each practice step and provides direct and actionable recommendations. 
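For concreteness, a minimal FHIR R4 Patient resource, the kind of building block such a practice guideline would cover, can be assembled as a plain dictionary; the identifier system and all values below are invented.

```python
import json

# Minimal FHIR R4 Patient resource as a plain dict; a production service
# would validate it against the relevant Implementation Guide and exchange
# it over a FHIR REST API rather than printing it locally.
patient = {
    "resourceType": "Patient",
    "identifier": [{"system": "urn:example:mrn", "value": "12345"}],
    "name": [{"family": "Zhang", "given": ["Wei"]}],
    "gender": "female",
    "birthDate": "1980-05-17",
}
print(json.dumps(patient, indent=2))
```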
Conclusions: The FHIR PG can bridge the gap between IGs and the process of building actual services by proposing actionable methods, procedures, and tools. It assists stakeholders in identifying participants' roles, managing the scope of responsibilities, and developing relevant modules, thus helping promote FHIR-based interoperable health care services. ", doi="10.2196/44842", url="https://medinform.jmir.org/2023/1/e44842", url="http://www.ncbi.nlm.nih.gov/pubmed/37603388" } @Article{info:doi/10.2196/41805, author="Esmaeilzadeh, Pouyan and Mirzaei, Tala", title="Role of Incentives in the Use of Blockchain-Based Platforms for Sharing Sensitive Health Data: Experimental Study", journal="J Med Internet Res", year="2023", month="Aug", day="18", volume="25", pages="e41805", keywords="blockchain technology", keywords="data sharing", keywords="health data", keywords="clinical research", keywords="incentive mechanisms", abstract="Background: Blockchain is an emerging technology that enables secure and decentralized approaches to reduce technical risks and governance challenges associated with sharing data. Although blockchain-based solutions have been suggested for sharing health information, it is still unclear whether a suitable incentive mechanism (intrinsic or extrinsic) can be identified to encourage individuals to share their sensitive data for research purposes. Objective: This study aimed to investigate how important extrinsic incentives are and what type of incentive is the best option in blockchain-based platforms designed for sharing sensitive health information. Methods: In this study, we conducted 3 experiments with 493 individuals to investigate the role of extrinsic incentives (ie, cryptocurrency, money, and recognition) in data sharing with research organizations. Results: The findings highlight that offering different incentives is insufficient to encourage individuals to use blockchain technology or to change their perceptions about the technology's premise for sharing sensitive health data. The results demonstrate that individuals still attribute serious risks to blockchain-based platforms. Privacy and security concerns, trust issues, lack of knowledge about the technology, lack of public acceptance, and lack of regulations are reported as top risks. In terms of attracting people to use blockchain-based platforms for data sharing in health care, we show that the effects of extrinsic motivations (cryptoincentives, money, and status) are significantly overshadowed by inhibitors to technology use. Conclusions: We suggest that before emphasizing the use of various types of extrinsic incentives, the users must be educated about the capabilities and benefits offered by this technology. Thus, an essential first step for shifting from an institution-based data exchange to a patient-centric data exchange (using blockchain) is addressing technology inhibitors to promote patient-driven data access control. This study shows that extrinsic incentives alone are inadequate to change users' perceptions, increase their trust, or encourage them to use technology for sharing health data. 
", doi="10.2196/41805", url="https://www.jmir.org/2023/1/e41805", url="http://www.ncbi.nlm.nih.gov/pubmed/37594783" } @Article{info:doi/10.2196/45614, author="Boussina, Aaron and Wardi, Gabriel and Shashikumar, Prajwal Supreeth and Malhotra, Atul and Zheng, Kai and Nemati, Shamim", title="Representation Learning and Spectral Clustering for the Development and External Validation of Dynamic Sepsis Phenotypes: Observational Cohort Study", journal="J Med Internet Res", year="2023", month="Jun", day="23", volume="25", pages="e45614", keywords="sepsis", keywords="phenotype", keywords="emergency service, hospital", keywords="disease progression", keywords="artificial intelligence", keywords="machine learning", keywords="emergency", keywords="infection", keywords="clinical phenotype", keywords="clinical phenotyping", keywords="transition model", keywords="transition modeling", abstract="Background: Recent attempts at clinical phenotyping for sepsis have shown promise in identifying groups of patients with distinct treatment responses. Nonetheless, the replicability and actionability of these phenotypes remain an issue because the patient trajectory is a function of both the patient's physiological state and the interventions they receive. Objective: We aimed to develop a novel approach for deriving clinical phenotypes using unsupervised learning and transition modeling. Methods: Forty commonly used clinical variables from the electronic health record were used as inputs to a feed-forward neural network trained to predict the onset of sepsis. Using spectral clustering on the representations from this network, we derived and validated consistent phenotypes across a diverse cohort of patients with sepsis. We modeled phenotype dynamics as a Markov decision process with transitions as a function of the patient's current state and the interventions they received. Results: Four consistent and distinct phenotypes were derived from over 11,500 adult patients who were admitted from the University of California, San Diego emergency department (ED) with sepsis between January 1, 2016, and January 31, 2020. Over 2000 adult patients admitted from the University of California, Irvine ED with sepsis between November 4, 2017, and August 4, 2022, were involved in the external validation. We demonstrate that sepsis phenotypes are not static and evolve in response to physiological factors and based on interventions. We show that roughly 45\% of patients change phenotype membership within the first 6 hours of ED arrival. We observed consistent trends in patient dynamics as a function of interventions including early administration of antibiotics. Conclusions: We derived and describe 4 sepsis phenotypes present within 6 hours of triage in the ED. We observe that the administration of a 30 mL/kg fluid bolus may be associated with worse outcomes in certain phenotypes, whereas prompt antimicrobial therapy is associated with improved outcomes. 
", doi="10.2196/45614", url="https://www.jmir.org/2023/1/e45614", url="http://www.ncbi.nlm.nih.gov/pubmed/37351927" } @Article{info:doi/10.2196/41048, author="Ranchon, Florence and Chanoine, S{\'e}bastien and Lambert-Lacroix, Sophie and Bosson, Jean-Luc and Moreau-Gaudry, Alexandre and Bedouch, Pierrick", title="Development of Indirect Health Data Linkage on Health Product Use and Care Trajectories in France: Systematic Review", journal="J Med Internet Res", year="2023", month="May", day="18", volume="25", pages="e41048", keywords="data linkage", keywords="health database", keywords="deterministic approach", keywords="probabilistic approach", keywords="health products", keywords="public health activity", keywords="health data", keywords="linkage", keywords="France", keywords="big data", keywords="usability", keywords="integration", keywords="care trajectories", abstract="Background: European national disparities in the integration of data linkage (ie, being able to match patient data between databases) into routine public health activities were recently highlighted. In France, the claims database covers almost the whole population from birth to death, offering a great research potential for data linkage. As the use of a common unique identifier to directly link personal data is often limited, linkage with a set of indirect key identifiers has been developed, which is associated with the linkage quality challenge to minimize errors in linked data. Objective: The aim of this systematic review is to analyze the type and quality of research publications on indirect data linkage on health product use and care trajectories in France. Methods: A comprehensive search for all papers published in PubMed/Medline and Embase databases up to December 31, 2022, involving linked French database focusing on health products use or care trajectories was realized. Only studies based on the use of indirect identifiers were included (ie, without a unique personal identifier available to easily link the databases). A descriptive analysis of data linkage with quality indicators and adherence to the Bohensky framework for evaluating data linkage studies was also realized. Results: In total, 16 papers were selected. Data linkage was performed at the national level in 7 (43.8\%) cases or at the local level in 9 (56.2\%) studies. The number of patients included in the different databases and resulting from data linkage varied greatly, respectively, from 713 to 75,000 patients and from 210 to 31,000 linked patients. The diseases studied were mainly chronic diseases and infections. The objectives of the data linkage were multiple: to estimate the risk of adverse drug reactions (ADRs; n=6, 37.5\%), to reconstruct the patient's care trajectory (n=5, 31.3\%), to describe therapeutic uses (n=2, 12.5\%), to evaluate the benefits of treatments (n=2, 12.5\%), and to evaluate treatment adherence (n=1, 6.3\%). Registries are the most frequently linked databases with French claims data. No studies have looked at linking with a hospital data warehouse, a clinical trial database, or patient self-reported databases. The linkage approach was deterministic in 7 (43.8\%) studies, probabilistic in 4 (25.0\%) studies, and not specified in 5 (31.3\%) studies. The linkage rate was mainly from 80\% to 90\% (reported in 11/15, 73.3\%, studies). 
Adherence to the Bohensky framework for evaluating data linkage studies showed that the description of the source databases for the linkage was always performed but that the completion rate and accuracy of the variables to be linked were not systematically described. Conclusions: This review highlights the growing interest in health data linkage in France. Nevertheless, regulatory, technical, and human constraints remain major obstacles to its deployment. The volume, variety, and validity of the data represent a real challenge, and advanced expertise and skills in statistical analysis and artificial intelligence are required to handle these big data. ", doi="10.2196/41048", url="https://www.jmir.org/2023/1/e41048", url="http://www.ncbi.nlm.nih.gov/pubmed/37200084" } @Article{info:doi/10.2196/42734, author="Di Cara, H. Nina and Maggio, Valerio and Davis, P. Oliver S. and Haworth, A. Claire M.", title="Methodologies for Monitoring Mental Health on Twitter: Systematic Review", journal="J Med Internet Res", year="2023", month="May", day="8", volume="25", pages="e42734", keywords="social media", keywords="mental health", keywords="mental illness", keywords="machine learning", abstract="Background: The use of social media data to predict mental health outcomes has the potential to allow for the continuous monitoring of mental health and well-being and provide timely information that can supplement traditional clinical assessments. However, it is crucial that the methodologies used to create models for this purpose are of high quality from both a mental health and machine learning perspective. Twitter has been a popular choice of social media because of the accessibility of its data, but access to big data sets is not a guarantee of robust results. Objective: This study aims to review the current methodologies used in the literature for predicting mental health outcomes from Twitter data, with a focus on the quality of the underlying mental health data and the machine learning methods used. Methods: A systematic search was performed across 6 databases, using keywords related to mental health disorders, algorithms, and social media. In total, 2759 records were screened, of which 164 (5.94\%) papers were analyzed. Information about methodologies for data acquisition, preprocessing, model creation, and validation was collected, as well as information about replicability and ethical considerations. Results: The 164 studies reviewed used 119 primary data sets. An additional 8 data sets were identified but were not described in enough detail to be included, and 6.1\% (10/164) of the papers did not describe their data sets at all. Of these 119 data sets, only 16 (13.4\%) had access to ground truth data (ie, known characteristics) about the mental health disorders of social media users. The other 86.6\% (103/119) of data sets collected data by searching keywords or phrases, which may not be representative of patterns of Twitter use for those with mental health disorders. The annotation of mental health disorders for classification labels was variable, and 57.1\% (68/119) of the data sets had no ground truth or clinical input on this annotation. Despite being a common mental health disorder, anxiety received little attention. Conclusions: The sharing of high-quality ground truth data sets is crucial for the development of trustworthy algorithms that have clinical and research utility. 
Further collaboration across disciplines and contexts is encouraged to better understand what types of predictions will be useful in supporting the management and identification of mental health disorders. A series of recommendations for researchers in this field and for the wider research community are made, with the aim of enhancing the quality and utility of future outputs. ", doi="10.2196/42734", url="https://www.jmir.org/2023/1/e42734", url="http://www.ncbi.nlm.nih.gov/pubmed/37155236" } @Article{info:doi/10.2196/43802, author="Annis, Ann and Reaves, Crista and Sender, Jessica and Bumpus, Sherry", title="Health-Related Data Sources Accessible to Health Researchers From the US Government: Mapping Review", journal="J Med Internet Res", year="2023", month="Apr", day="27", volume="25", pages="e43802", keywords="data sets as topic", keywords="federal government", keywords="data collection", keywords="survey", keywords="questionnaire", keywords="health surveys", keywords="big data", keywords="government", keywords="data set", keywords="public domain", keywords="data source", keywords="systematic review", keywords="mapping review", keywords="review method", keywords="open data", keywords="health research", abstract="Background: Big data from large, government-sponsored surveys and data sets offers researchers opportunities to conduct population-based studies of important health issues in the United States, as well as develop preliminary data to support proposed future work. Yet, navigating these national data sources is challenging. Despite the widespread availability of national data, there is little guidance for researchers on how to access and evaluate the use of these resources. Objective: Our aim was to identify and summarize a comprehensive list of federally sponsored, health- and health care--related data sources that are accessible in the public domain in order to facilitate their use by researchers. Methods: We conducted a systematic mapping review of government sources of health-related data on US populations and with active or recent (previous 10 years) data collection. The key measures were government sponsor, overview and purpose of data, population of interest, sampling design, sample size, data collection methodology, type and description of data, and cost to obtain data. Convergent synthesis was used to aggregate findings. Results: Among 106 unique data sources, 57 met the inclusion criteria. Data sources were classified as survey or assessment data (n=30, 53\%), trends data (n=27, 47\%), summative processed data (n=27, 47\%), primary registry data (n=17, 30\%), and evaluative data (n=11, 19\%). Most (n=39, 68\%) served more than 1 purpose. The population of interest included individuals/patients (n=40, 70\%), providers (n=15, 26\%), and health care sites and systems (n=14, 25\%). The sources collected data on demographic (n=44, 77\%) and clinical information (n=35, 61\%), health behaviors (n=24, 42\%), provider or practice characteristics (n=22, 39\%), health care costs (n=17, 30\%), and laboratory tests (n=8, 14\%). Most (n=43, 75\%) offered free data sets. Conclusions: A broad scope of national health data is accessible to researchers. These data provide insights into important health issues and the nation's health care system while eliminating the burden of primary data collection. Data standardization and uniformity were uncommon across government entities, highlighting a need to improve data consistency. 
Secondary analyses of national data are a feasible, cost-efficient means to address national health concerns. ", doi="10.2196/43802", url="https://www.jmir.org/2023/1/e43802", url="http://www.ncbi.nlm.nih.gov/pubmed/37103987" } @Article{info:doi/10.2196/45913, author="Lee, Heui Yoon and Jang, Yu-Jin and Lee, Soo-Kyoung", title="Obstacles to Health Big Data Utilization Based on the Perceptions and Demands of Health Care Workers in South Korea: Web-Based Survey Study", journal="JMIR Form Res", year="2023", month="Apr", day="13", volume="7", pages="e45913", keywords="demand", keywords="health big data", keywords="health care worker", keywords="obstacles", keywords="perception", keywords="utilization", abstract="Background: This study focuses on the potential of health big data in the South Korean context. Despite huge data reserves and pan-government efforts to increase data use, the utilization is limited to public interest research centered in public institutions that have data. To increase the use of health big data, it is necessary to identify and develop measures to meet the various demands for such data from individuals, private companies, and research institutes. Objective: The aim of this study was to identify the perceptions of and demands for health big data analysis and use among workers in health care--related occupations and to clarify the obstacles to the use of health big data. Methods: From May 8 to May 18, 2022, we conducted a web-based survey among 390 health care--related workers in South Korea. We used Fisher exact test and analysis of variance to estimate the differences among occupations. We expressed the analysis results by item in frequency and percentage and expressed the difficulties in analyzing health big data by mean and standard deviation. Results: The respondents who revealed the need to use health big data in health care work--related fields accounted for 86.4\% (337/390); 65.6\% (256/390) of the respondents had never used health big data. The lack of awareness about the source of the desired data was the most cited reason for nonuse by 39.6\% (153/386) of the respondents. The most cited obstacle to using health big data by the respondents was the difficulty in data integration and expression unit matching, followed by missing value processing and noise removal. Thus, the respondents experienced the greatest difficulty in the data preprocessing stage during the health big data analysis process, regardless of occupation. Approximately 91.8\% (358/390) of the participants responded that they were willing to use the system if a system supporting big data analysis was developed. As suggestions for the specific necessary support system, the reporting and provision of appropriate data and expert advice on questions arising during the overall process of big data analysis were mentioned. Conclusions: Our findings indicate respondents' high awareness of and demand for health big data. Our findings also reveal the low utilization of health big data and the need to support health care workers in their analysis and use of such data. Hence, we recommend the development of a customized support system that meets the specific requirements of big data analysis by users such as individuals, nongovernmental agencies, and academia. Our study is significant because it identified important but overlooked failure factors. Thus, it is necessary to prepare practical measures to increase the utilization of health big data in the future. 
", doi="10.2196/45913", url="https://formative.jmir.org/2023/1/e45913", url="http://www.ncbi.nlm.nih.gov/pubmed/37052992" } @Article{info:doi/10.2196/42292, author="Amusa, Babatunde Lateef and Twinomurinzi, Hossana and Phalane, Edith and Phaswana-Mafuya, Nancy Refilwe", title="Big Data and Infectious Disease Epidemiology: Bibliometric Analysis and Research Agenda", journal="Interact J Med Res", year="2023", month="Mar", day="31", volume="12", pages="e42292", keywords="big data", keywords="bibliometrics", keywords="infectious disease", keywords="COVID-19", keywords="disease surveillance", keywords="disease", keywords="pandemic", keywords="data", keywords="surveillance", keywords="hotspot", keywords="epidemiology", keywords="social media", keywords="utility", keywords="electronic health records", abstract="Background: Infectious diseases represent a major challenge for health systems worldwide. With the recent global pandemic of COVID-19, the need to research strategies to treat these health problems has become even more pressing. Although the literature on big data and data science in health has grown rapidly, few studies have synthesized these individual studies, and none has identified the utility of big data in infectious disease surveillance and modeling. Objective: The aim of this study was to synthesize research and identify hotspots of big data in infectious disease epidemiology. Methods: Bibliometric data from 3054 documents that satisfied the inclusion criteria retrieved from the Web of Science database over 22 years (2000-2022) were analyzed and reviewed. The search retrieval occurred on October 17, 2022. Bibliometric analysis was performed to illustrate the relationships between research constituents, topics, and key terms in the retrieved documents. Results: The bibliometric analysis revealed internet searches and social media as the most utilized big data sources for infectious disease surveillance or modeling. The analysis also placed US and Chinese institutions as leaders in this research area. Disease monitoring and surveillance, utility of electronic health (or medical) records, methodology framework for infodemiology tools, and machine/deep learning were identified as the core research themes. Conclusions: Proposals for future studies are made based on these findings. This study will provide health care informatics scholars with a comprehensive understanding of big data research in infectious disease epidemiology. ", doi="10.2196/42292", url="https://www.i-jmr.org/2023/1/e42292", url="http://www.ncbi.nlm.nih.gov/pubmed/36913554" } @Article{info:doi/10.2196/40814, author="Hirst, Yasemin and Stoffel, T. Sandro and Brewer, R. Hannah and Timotijevic, Lada and Raats, M. Monique and Flanagan, M. James", title="Understanding Public Attitudes and Willingness to Share Commercial Data for Health Research: Survey Study in the United Kingdom", journal="JMIR Public Health Surveill", year="2023", month="Mar", day="23", volume="9", pages="e40814", keywords="commercial data", keywords="data sharing", keywords="participant recruitment", keywords="loyalty cards", keywords="sociodemographic factors", keywords="data donation", keywords="data", keywords="health", keywords="public", keywords="acceptability", keywords="digital", keywords="mobile phone", abstract="Background: Health research using commercial data is increasing. The evidence on public acceptability and sociodemographic characteristics of individuals willing to share commercial data for health research is scarce. 
Objective: This survey study investigates the willingness to share commercial data for health research in the United Kingdom with 3 different organizations (government, private, and academic institutions), 5 different data types (internet, shopping, wearable devices, smartphones, and social media), and 10 different invitation methods to recruit participants for research studies, with a focus on sociodemographic characteristics and psychological predictors. Methods: We conducted a web-based survey using quota sampling based on age distribution in the United Kingdom in July 2020 (N=1534). Chi-square tests were used to test for differences by sociodemographic characteristics, and adjusted ordered logistic regressions were used to test associations with trust, perceived importance of privacy, worry about data misuse and perceived risks, and perceived benefits of data sharing. The results are shown as percentages, adjusted odds ratios, and 95\% CIs. Results: Overall, 61.1\% (937/1534) of participants were willing to share their data with the government and 61\% (936/1534) of participants were willing to share their data with academic research institutions compared with 43.1\% (661/1534) who were willing to share their data with private organizations. The willingness to share varied between specific types of data---51.8\% (794/1534) for loyalty cards, 35.2\% (540/1534) for internet search history, 32\% (491/1534) for smartphone data, 31.8\% (488/1534) for wearable device data, and 30.4\% (467/1534) for social media data. Increasing age was consistently and negatively associated with all the outcomes. Trust was positively associated with willingness to share commercial data, whereas worry about data misuse and the perceived importance of privacy were negatively associated with willingness to share commercial data. The perceived risk of sharing data was positively associated with willingness to share when the participants considered all the specific data types but not with the organizations. The participants favored postal research invitations over digital research invitations. Conclusions: This UK-based survey study shows that willingness to share commercial data for health research varies; however, researchers should focus on effectively communicating their data practices to minimize concerns about data misuse and improve public trust in data science. The results of this study can be further used as a guide to consider methods to improve recruitment strategies in health-related research and to improve response rates and participant retention. 
", doi="10.2196/40814", url="https://publichealth.jmir.org/2023/1/e40814", url="http://www.ncbi.nlm.nih.gov/pubmed/36951929" } @Article{info:doi/10.2196/43847, author="Williams, Elena and Kienast, Manuel and Medawar, Evelyn and Reinelt, Janis and Merola, Alberto and Klopfenstein, Ines Sophie Anne and Flint, Rike Anne and Heeren, Patrick and Poncette, Akira-Sebastian and Balzer, Felix and Beimes, Julian and von B{\"u}nau, Paul and Chromik, Jonas and Arnrich, Bert and Scherf, Nico and Niehaus, Sebastian", title="A Standardized Clinical Data Harmonization Pipeline for Scalable AI Application Deployment (FHIR-DHP): Validation and Usability Study", journal="JMIR Med Inform", year="2023", month="Mar", day="21", volume="11", pages="e43847", keywords="data interoperability", keywords="fast healthcare interoperability resources", keywords="FHIR", keywords="data standardization pipeline", keywords="medical information mart for intensive care", keywords="MIMIC IV", keywords="artificial intelligence", keywords="AI application", keywords="AI", keywords="deployment", keywords="data", keywords="usability", keywords="care unit", keywords="diagnosis", keywords="cooperation", keywords="patient care", keywords="care", keywords="medical research", abstract="Background: Increasing digitalization in the medical domain gives rise to large amounts of health care data, which has the potential to expand clinical knowledge and transform patient care if leveraged through artificial intelligence (AI). Yet, big data and AI oftentimes cannot unlock their full potential at scale, owing to nonstandardized data formats, lack of technical and semantic data interoperability, and limited cooperation between stakeholders in the health care system. Despite the existence of standardized data formats for the medical domain, such as Fast Healthcare Interoperability Resources (FHIR), their prevalence and usability for AI remain limited. Objective: In this paper, we developed a data harmonization pipeline (DHP) for clinical data sets relying on the common FHIR data standard. Methods: We validated the performance and usability of our FHIR-DHP with data from the Medical Information Mart for Intensive Care IV database. Results: We present the FHIR-DHP workflow in respect of the transformation of ``raw'' hospital records into a harmonized, AI-friendly data representation. The pipeline consists of the following 5 key preprocessing steps: querying of data from hospital database, FHIR mapping, syntactic validation, transfer of harmonized data into the patient-model database, and export of data in an AI-friendly format for further medical applications. A detailed example of FHIR-DHP execution was presented for clinical diagnoses records. Conclusions: Our approach enables the scalable and needs-driven data modeling of large and heterogenous clinical data sets. The FHIR-DHP is a pivotal step toward increasing cooperation, interoperability, and quality of patient care in the clinical routine and for medical research. ", doi="10.2196/43847", url="https://medinform.jmir.org/2023/1/e43847", url="http://www.ncbi.nlm.nih.gov/pubmed/36943344" } @Article{info:doi/10.2196/42822, author="Sinaci, Anil A. 
and Gencturk, Mert and Teoman, Alper Huseyin and Laleci Erturkmen, Banu Gokce and Alvarez-Romero, Celia and Martinez-Garcia, Alicia and Poblador-Plou, Beatriz and Carmona-P{\'i}rez, Jon{\'a}s and L{\"o}be, Matthias and Parra-Calderon, Luis Carlos", title="A Data Transformation Methodology to Create Findable, Accessible, Interoperable, and Reusable Health Data: Software Design, Development, and Evaluation Study", journal="J Med Internet Res", year="2023", month="Mar", day="8", volume="25", pages="e42822", keywords="Health Level 7 Fast Healthcare Interoperability Resources", keywords="HL7 FHIR", keywords="Findable, Accessible, Interoperable, and Reusable principles", keywords="FAIR principles", keywords="health data sharing", keywords="health data transformation", keywords="secondary use", abstract="Background: Sharing health data is challenging because of several technical, ethical, and regulatory issues. The Findable, Accessible, Interoperable, and Reusable (FAIR) guiding principles have been conceptualized to enable data interoperability. Many studies provide implementation guidelines, assessment metrics, and software to achieve FAIR-compliant data, especially for health data sets. Health Level 7 (HL7) Fast Healthcare Interoperability Resources (FHIR) is a health data content modeling and exchange standard. Objective: Our goal was to devise a new methodology to extract, transform, and load existing health data sets into HL7 FHIR repositories in line with FAIR principles, develop a Data Curation Tool to implement the methodology, and evaluate it on health data sets from 2 different but complementary institutions. We aimed to increase the level of compliance with FAIR principles of existing health data sets through standardization and facilitate health data sharing by eliminating the associated technical barriers. Methods: Our approach automatically processes the capabilities of a given FHIR end point and directs the user while configuring mappings according to the rules enforced by FHIR profile definitions. Code system mappings can be configured for terminology translations through automatic use of FHIR resources. The validity of the created FHIR resources can be automatically checked, and the software does not allow invalid resources to be persisted. At each stage of our data transformation methodology, we used particular FHIR-based techniques so that the resulting data set could be evaluated as FAIR. We performed a data-centric evaluation of our methodology on health data sets from 2 different institutions. Results: Through an intuitive graphical user interface, users are prompted to configure the mappings into FHIR resource types with respect to the restrictions of selected profiles. Once the mappings are developed, our approach can syntactically and semantically transform existing health data sets into HL7 FHIR without loss of data utility according to our privacy-concerned criteria. In addition to the mapped resource types, behind the scenes, we create additional FHIR resources to satisfy several FAIR criteria. According to the data maturity indicators and evaluation methods of the FAIR Data Maturity Model, we achieved the maximum level (level 5) for being Findable, Accessible, and Interoperable and level 3 for being Reusable. Conclusions: We developed and extensively evaluated our data transformation approach to unlock the value of existing health data residing in disparate data silos to make them available for sharing according to the FAIR principles. 
We showed that our method can successfully transform existing health data sets into HL7 FHIR without loss of data utility, and the result is FAIR in terms of the FAIR Data Maturity Model. We support institutional migration to HL7 FHIR, which not only leads to FAIR data sharing but also eases the integration with different research networks. ", doi="10.2196/42822", url="https://www.jmir.org/2023/1/e42822", url="http://www.ncbi.nlm.nih.gov/pubmed/36884270" } @Article{info:doi/10.2196/40312, author="Reinecke, Ines and Siebel, Joscha and Fuhrmann, Saskia and Fischer, Andreas and Sedlmayr, Martin and Weidner, Jens and Bathelt, Franziska", title="Assessment and Improvement of Drug Data Structuredness From Electronic Health Records: Algorithm Development and Validation", journal="JMIR Med Inform", year="2023", month="Jan", day="25", volume="11", pages="e40312", keywords="secondary usage", keywords="Observational Medical Outcomes Partnership", keywords="OMOP", keywords="drug data", keywords="data quality", keywords="Anatomical Therapeutic Chemical", keywords="ATC", keywords="RxNorm", keywords="interoperability", abstract="Background: Digitization offers a multitude of opportunities to gain insights into current diagnostics and therapies from retrospective data. In this context, real-world data and their accessibility are of increasing importance to support unbiased and reliable research on big data. However, routinely collected data are not readily usable for research owing to the unstructured nature of health care systems and a lack of interoperability between these systems. This challenge is evident in drug data. Objective: This study aimed to present an approach that identifies and increases the structuredness of drug data while ensuring standardization according to Anatomical Therapeutic Chemical (ATC) classification. Methods: Our approach was based on available drug prescriptions and a drug catalog and consisted of 4 steps. First, we performed an initial analysis of the structuredness of local drug data to define a point of comparison for the effectiveness of the overall approach. Second, we applied 3 algorithms to unstructured data that translated text into ATC codes based on string comparisons in terms of ingredients and product names and performed similarity comparisons based on Levenshtein distance. Third, we validated the results of the 3 algorithms with expert knowledge based on the 1000 most frequently used prescription texts. Fourth, we performed a final validation to determine the increased degree of structuredness. Results: Initially, 47.73\% (n=843,980) of 1,768,153 drug prescriptions were classified as structured. With the application of the 3 algorithms, we were able to increase the degree of structuredness to 85.18\% (n=1,506,059) based on the 1000 most frequent medication prescriptions. In this regard, the combination of algorithms 1, 2, and 3 resulted in a correctness level of 100\% (with 57,264 ATC codes identified), algorithms 1 and 3 resulted in 99.6\% (with 152,404 codes identified), and algorithms 1 and 2 resulted in 95.9\% (with 39,472 codes identified). Conclusions: As shown in the first analysis steps of our approach, the availability of a product catalog to select during the documentation process is not sufficient to generate structured data. Our 4-step approach reduces the problems and reliably increases the structuredness automatically. Similarity matching shows promising results, particularly for entries with no connection to a product catalog. 
However, further enhancement of the correctness of such a similarity matching algorithm needs to be investigated in future work. ", doi="10.2196/40312", url="https://medinform.jmir.org/2023/1/e40312", url="http://www.ncbi.nlm.nih.gov/pubmed/36696159" } @Article{info:doi/10.2196/38590, author="Chen, Xiaojie and Chen, Han and Nan, Shan and Kong, Xiangtian and Duan, Huilong and Zhu, Haiyan", title="Dealing With Missing, Imbalanced, and Sparse Features During the Development of a Prediction Model for Sudden Death Using Emergency Medicine Data: Machine Learning Approach", journal="JMIR Med Inform", year="2023", month="Jan", day="20", volume="11", pages="e38590", keywords="emergency medicine", keywords="prediction model", keywords="data preprocessing", keywords="imbalanced data", keywords="missing value interpolation", keywords="sparse features", keywords="clinical informatics", keywords="machine learning", keywords="medical informatics", abstract="Background: In emergency departments (EDs), early diagnosis and timely rescue, which are supported by prediction models using ED data, can increase patients' chances of survival. Unfortunately, ED data usually contain missing, imbalanced, and sparse features, which makes it challenging to build early identification models for diseases. Objective: This study aims to propose a systematic approach to deal with the problems of missing, imbalanced, and sparse features for developing sudden-death prediction models using emergency medicine (or ED) data. Methods: We proposed a 3-step approach to deal with data quality issues: a random forest (RF) for missing values, k-means for imbalanced data, and principal component analysis (PCA) for sparse features. For continuous and discrete variables, the coefficient of determination (R2) and the $\kappa$ coefficient were used to evaluate performance, respectively. The area under the receiver operating characteristic curve (AUROC) and the area under the precision-recall curve (AUPRC) were used to estimate the model's performance. To further evaluate the proposed approach, we carried out a case study using an ED data set obtained from the Hainan Hospital of Chinese PLA General Hospital. A logistic regression (LR) prediction model for patient condition worsening was built. Results: A total of 1085 patients with rescue records and 17,959 patients without rescue records were selected, and the 2 groups were significantly imbalanced. We extracted 275, 402, and 891 variables from laboratory tests, medications, and diagnosis, respectively. After data preprocessing, the median R2 of the RF continuous variable interpolation was 0.623 (IQR 0.647), and the median of the $\kappa$ coefficient for discrete variable interpolation was 0.444 (IQR 0.285). The LR model constructed using the initial diagnostic data showed poor performance and variable separation, which was reflected in the abnormally high odds ratio (OR) values of the 2 variables of cardiac arrest and respiratory arrest (201568034532 and 1211118945, respectively) and an abnormal 95\% CI. Using processed data, the recall of the model reached 0.746, the F1-score was 0.73, and the AUROC was 0.708. Conclusions: The proposed systematic approach is valid for building a prediction model for emergency patients. 
", doi="10.2196/38590", url="https://medinform.jmir.org/2023/1/e38590", url="http://www.ncbi.nlm.nih.gov/pubmed/36662548" } @Article{info:doi/10.2196/42401, author="van Kessel, Robin and Kyriopoulos, Ilias and Wong, Han Brian Li and Mossialos, Elias", title="The Effect of the COVID-19 Pandemic on Digital Health--Seeking Behavior: Big Data Interrupted Time-Series Analysis of Google Trends", journal="J Med Internet Res", year="2023", month="Jan", day="16", volume="25", pages="e42401", keywords="digital health", keywords="healthcare seeking behaviour", keywords="big data", keywords="real-world data", keywords="data", keywords="COVID-19", keywords="pandemic", keywords="Google Trends", keywords="telehealth", abstract="Background: Due to the emergency responses early in the COVID-19 pandemic, the use of digital health in health care increased abruptly. However, it remains unclear whether this introduction was sustained in the long term, especially with patients being able to decide between digital and traditional health services once the latter regained their functionality throughout the COVID-19 pandemic. Objective: We aim to understand how the public interest in digital health changed as proxy for digital health--seeking behavior and to what extent this change was sustainable over time. Methods: We used an interrupted time-series analysis of Google Trends data with break points on March 11, 2020 (declaration of COVID-19 as a pandemic by the World Health Organization), and December 20, 2020 (the announcement of the first COVID-19 vaccines). Nationally representative time-series data from February 2019 to August 2021 were extracted from Google Trends for 6 countries with English as their dominant language: Canada, the United States, the United Kingdom, New Zealand, Australia, and Ireland. We measured the changes in relative search volumes of the keywords online doctor, telehealth, online health, telemedicine, and health app. In doing so, we capture the prepandemic trend, the immediate change due to the announcement of COVID-19 being a pandemic, and the gradual change after the announcement. Results: Digital health search volumes immediately increased in all countries under study after the announcement of COVID-19 being a pandemic. There was some variation in what keywords were used per country. However, searches declined after this immediate spike, sometimes reverting to prepandemic levels. The announcement of COVID-19 vaccines did not consistently impact digital health search volumes in the countries under study. The exception is the search volume of health app, which was observed as either being stable or gradually increasing during the pandemic. Conclusions: Our findings suggest that the increased public interest in digital health associated with the pandemic did not sustain, alluding to remaining structural barriers. Further building of digital health capacity and developing robust digital health governance frameworks remain crucial to facilitating sustainable digital health transformation. 
", doi="10.2196/42401", url="https://www.jmir.org/2023/1/e42401", url="http://www.ncbi.nlm.nih.gov/pubmed/36603152" } @Article{info:doi/10.2196/38922, author="Miyaji, Atsuko and Watanabe, Kaname and Takano, Yuuki and Nakasho, Kazuhisa and Nakamura, Sho and Wang, Yuntao and Narimatsu, Hiroto", title="A Privacy-Preserving Distributed Medical Data Integration Security System for Accuracy Assessment of Cancer Screening: Development Study of Novel Data Integration System", journal="JMIR Med Inform", year="2022", month="Dec", day="30", volume="10", number="12", pages="e38922", keywords="data linkage", keywords="data security", keywords="secure data integration", keywords="privacy-preserving linkage", keywords="secure matching privacy-preserving linkage", keywords="private set intersection", keywords="PSI", keywords="privacy-preserving distributed data integration", keywords="PDDI", keywords="big data", keywords="medical informatics", keywords="cancer prevention", keywords="cancer epidemiology", keywords="epidemiological survey", abstract="Background: Big data useful for epidemiological research can be obtained by integrating data corresponding to individuals between databases managed by different institutions. Privacy information must be protected while performing efficient, high-level data matching. Objective: Privacy-preserving distributed data integration (PDDI) enables data matching between multiple databases without moving privacy information; however, its actual implementation requires matching security, accuracy, and performance. Moreover, identifying the optimal data item in the absence of a unique matching key is necessary. We aimed to conduct a basic matching experiment using a model to assess the accuracy of cancer screening. Methods: To experiment with actual data, we created a data set mimicking the cancer screening and registration data in Japan and conducted a matching experiment using a PDDI system between geographically distant institutions. Errors similar to those found empirically in data sets recorded in Japanese were artificially introduced into the data set. The matching-key error rate of the data common to both data sets was set sufficiently higher than expected in the actual database: 85.0\% and 59.0\% for the data simulating colorectal and breast cancers, respectively. Various combinations of name, gender, date of birth, and address were used for the matching key. To evaluate the matching accuracy, the matching sensitivity and specificity were calculated based on the number of cancer-screening data points, and the effect of matching accuracy on the sensitivity and specificity of cancer screening was estimated based on the obtained values. To evaluate the performance, we measured central processing unit use, memory use, and network traffic. Results: For combinations with a specificity ?99\% and high sensitivity, the date of birth and first name were used in the data simulating colorectal cancer, and the matching sensitivity and specificity were 55.00\% and 99.85\%, respectively. In the data simulating breast cancer, the date of birth and family name were used, and the matching sensitivity and specificity were 88.71\% and 99.98\%, respectively. Assuming the sensitivity and specificity of cancer screening at 90\%, the apparent values decreased to 74.90\% and 89.93\%, respectively. A trial calculation was performed using a combination with the same data set and 100\% specificity. 
When the matching sensitivity was 82.26\%, the apparent screening sensitivity was maintained at 90\%, and the screening specificity decreased to 89.89\%. For 214 data points, the execution time was 82 minutes and 26 seconds without parallelization and 11 minutes and 38 seconds with parallelization; 19.33\% of the calculation time was for the data-holding institutions. Memory use was 3.4 GB for the PDDI server and 2.7 GB for the data-holding institutions. Conclusions: We demonstrated the rudimentary feasibility of introducing a PDDI system for cancer-screening accuracy assessment. We plan to conduct matching experiments based on actual data and compare them with the existing methods. ", doi="10.2196/38922", url="https://medinform.jmir.org/2022/12/e38922", url="http://www.ncbi.nlm.nih.gov/pubmed/36583931" } @Article{info:doi/10.2196/40743, author="Jin, Qiao and Tan, Chuanqi and Chen, Mosha and Yan, Ming and Zhang, Ningyu and Huang, Songfang and Liu, Xiaozhong", title="State-of-the-Art Evidence Retriever for Precision Medicine: Algorithm Development and Validation", journal="JMIR Med Inform", year="2022", month="Dec", day="15", volume="10", number="12", pages="e40743", keywords="precision medicine", keywords="evidence-based medicine", keywords="information retrieval", keywords="active learning", keywords="pretrained language models", keywords="digital health intervention", keywords="data retrieval", keywords="big data", keywords="algorithm development", abstract="Background: Under the paradigm of precision medicine (PM), patients with the same disease can receive different personalized therapies according to their clinical and genetic features. These therapies are determined by the totality of all available clinical evidence, including results from case reports, clinical trials, and systematic reviews. However, it is increasingly difficult for physicians to find such evidence from scientific publications, whose size is growing at an unprecedented pace. Objective: In this work, we propose the PM-Search system to facilitate the retrieval of clinical literature that contains critical evidence for or against giving specific therapies to certain cancer patients. Methods: The PM-Search system combines a baseline retriever that selects document candidates at a large scale and an evidence reranker that finely reorders the candidates based on their evidence quality. The baseline retriever uses query expansion and keyword matching with the ElasticSearch retrieval engine, and the evidence reranker fits pretrained language models to expert annotations that are derived from an active learning strategy. Results: The PM-Search system achieved the best performance in the retrieval of high-quality clinical evidence at the Text Retrieval Conference PM Track 2020, outperforming the second-ranking systems by large margins (0.4780 vs 0.4238 for standard normalized discounted cumulative gain at rank 30 and 0.4519 vs 0.4193 for exponential normalized discounted cumulative gain at rank 30). Conclusions: We present PM-Search, a state-of-the-art search engine to assist the practice of evidence-based PM. PM-Search uses a novel Bidirectional Encoder Representations from Transformers for Biomedical Text Mining--based active learning strategy that models evidence quality and improves the model performance. Our analyses show that evidence quality is a distinct aspect from general relevance, and specific modeling of evidence quality beyond general relevance is required for a PM search engine. 
", doi="10.2196/40743", url="https://medinform.jmir.org/2022/12/e40743", url="http://www.ncbi.nlm.nih.gov/pubmed/36409468" } @Article{info:doi/10.2196/38232, author="Teodorowski, Piotr and Rodgers, E. Sarah and Fleming, Kate and Frith, Lucy", title="Use of the Hashtag \#DataSavesLives on Twitter: Exploratory and Thematic Analysis", journal="J Med Internet Res", year="2022", month="Nov", day="15", volume="24", number="11", pages="e38232", keywords="consumer involvement", keywords="patient participation", keywords="stakeholder participation", keywords="social media", keywords="public engagement", keywords="campaign", keywords="big data", keywords="research", keywords="trust", keywords="tweets", keywords="Twitter", keywords="perception", keywords="usage", keywords="users", keywords="data sharing", keywords="ethics", keywords="community", keywords="hashtag", abstract="Background: ``Data Saves Lives'' is a public engagement campaign that highlights the benefits of big data research and aims to establish public trust for this emerging research area. Objective: This study explores how the hashtag \#DataSavesLives is used on Twitter. We focused on the period when the UK government and its agencies adopted \#DataSavesLives in an attempt to support their plans to set up a new database holding National Health Service (NHS) users' medical data. Methods: Public tweets published between April 19 and July 15, 2021, using the hashtag \#DataSavesLives were saved using NCapture for NVivo 12. All tweets were coded twice. First, each tweet was assigned a positive, neutral, or negative attitude toward the campaign. Second, inductive thematic analysis was conducted. The results of the thematic analysis were mapped under 3 models of public engagement: deficit, dialogue, and participatory. Results: Of 1026 unique tweets available for qualitative analysis, discussion around \#DataSavesLives was largely positive (n=716, 69.8\%) or neutral (n=276, 26.9\%) toward the campaign with limited negative attitudes (n=34, 3.3\%). Themes derived from the \#DataSavesLives debate included ethical sharing, proactively engaging the public, coproducing knowledge with the public, harnessing potential, and gaining an understanding of big data research. The Twitter discourse was largely positive toward the campaign. The hashtag is predominantly used by similar-minded Twitter users to share information about big data projects and to spread positive messages about big data research when there are public controversies. The hashtag is generally used by organizations and people supportive of big data research. Tweet authors recognize that the public should be proactively engaged and involved in big data projects. The campaign remains UK centric. The results indicate that the communication around big data research is driven by the professional community and remains 1-way as members of the public rarely use the hashtag. Conclusions: The results demonstrate the potential of social media but draws attention to hashtag usage being generally confined to ``Twitter bubbles'': groups of similar-minded Twitter users. ", doi="10.2196/38232", url="https://www.jmir.org/2022/11/e38232", url="http://www.ncbi.nlm.nih.gov/pubmed/36378518" } @Article{info:doi/10.2196/35622, author="Higa, Eduardo and Elb{\'e}ji, Abir and Zhang, Lu and Fischer, Aur{\'e}lie and Aguayo, A. Gloria and Nazarov, V. 
Petr and Fagherazzi, Guy", title="Discovery and Analytical Validation of a Vocal Biomarker to Monitor Anosmia and Ageusia in Patients With COVID-19: Cross-sectional Study", journal="JMIR Med Inform", year="2022", month="Nov", day="8", volume="10", number="11", pages="e35622", keywords="vocal biomarker", keywords="COVID-19", keywords="ageusia", keywords="anosmia", keywords="loss of smell", keywords="loss of taste", keywords="digital assessment tool", keywords="digital health", keywords="medical informatics", keywords="telehealth", keywords="telemonitoring", keywords="biomarker", keywords="pandemic", keywords="symptoms", keywords="tool", keywords="disease", keywords="noninvasive", keywords="AI", keywords="artificial intelligence", keywords="digital", keywords="device", abstract="Background: The COVID-19 disease has multiple symptoms, with anosmia and ageusia being the most prevalent, varying from 75\% to 95\% and from 50\% to 80\% of infected patients, respectively. An automatic assessment tool for these symptoms will help monitor the disease in a fast and noninvasive manner. Objective: We hypothesized that people with COVID-19 experiencing anosmia and ageusia had different voice features than those without such symptoms. Our objective was to develop an artificial intelligence pipeline to identify and internally validate a vocal biomarker of these symptoms for remotely monitoring them. Methods: This study used population-based data. Participants were assessed daily through a web-based questionnaire and asked to register 2 different types of voice recordings. They were adults (aged >18 years) who were confirmed by a polymerase chain reaction test to be positive for COVID-19 in Luxembourg and met the inclusion criteria. Statistical methods such as recursive feature elimination for dimensionality reduction, multiple statistical learning methods, and hypothesis tests were used throughout this study. The TRIPOD (Transparent Reporting of a multivariable prediction model for Individual Prognosis Or Diagnosis) Prediction Model Development checklist was used to structure the research. Results: This study included 259 participants. Younger (aged <35 years) and female participants showed higher rates of ageusia and anosmia. Participants were aged 41 (SD 13) years on average, and the data set was balanced for sex (female: 134/259, 51.7\%; male: 125/259, 48.3\%). The analyzed symptom was present in 94 (36.3\%) out of 259 participants and in 450 (27.5\%) out of 1636 audio recordings. In all, 2 machine learning models were built, one for Android and one for iOS devices, and both had high accuracy---88\% for Android and 85\% for iOS. The final biomarker was then calculated using these models and internally validated. Conclusions: This study demonstrates that people with COVID-19 who have anosmia and ageusia have different voice features from those without these symptoms. Upon further validation, these vocal biomarkers could be nested in digital devices to improve symptom assessment in clinical practice and enhance the telemonitoring of COVID-19--related symptoms. 
Trial Registration: Clinicaltrials.gov NCT04380987; https://clinicaltrials.gov/ct2/show/NCT04380987 ", doi="10.2196/35622", url="https://medinform.jmir.org/2022/11/e35622", url="http://www.ncbi.nlm.nih.gov/pubmed/36265042" } @Article{info:doi/10.2196/35138, author="Qin, Jiaxin and Wu, Jian", title="Realizing the Potential of Computer-Assisted Surgery by Embedding Digital Twin Technology", journal="JMIR Med Inform", year="2022", month="Nov", day="8", volume="10", number="11", pages="e35138", keywords="computer-assisted surgery", keywords="digital twin", keywords="virtual space", keywords="surgical navigation", keywords="remote surgery", doi="10.2196/35138", url="https://medinform.jmir.org/2022/11/e35138", url="http://www.ncbi.nlm.nih.gov/pubmed/36346669" } @Article{info:doi/10.2196/36711, author="Guardiolle, Vianney and Bazoge, Adrien and Morin, Emmanuel and Daille, B{\'e}atrice and Toublant, Delphine and Bouzill{\'e}, Guillaume and Merel, Youenn and Pierre-Jean, Morgane and Filiot, Alexandre and Cuggia, Marc and Wargny, Matthieu and Lamer, Antoine and Gourraud, Pierre-Antoine", title="Linking Biomedical Data Warehouse Records With the National Mortality Database in France: Large-scale Matching Algorithm", journal="JMIR Med Inform", year="2022", month="Nov", day="1", volume="10", number="11", pages="e36711", keywords="data warehousing", keywords="clinical data warehouse", keywords="medical informatics applications", keywords="medical record linkage", keywords="French National Mortality Database", keywords="data reuse", keywords="open data, R", keywords="clinical informatics", abstract="Background: Often missing from or uncertain in a biomedical data warehouse (BDW), vital status after discharge is central to the value of a BDW in medical research. The French National Mortality Database (FNMD) offers open-source nominative records of every death. Matching large-scale BDW records with the FNMD poses multiple challenges: absence of unique common identifiers between the 2 databases, names changing over life, clerical errors, and the exponential growth of the number of comparisons to compute. Objective: We aimed to develop a new algorithm for matching BDW records to the FNMD and evaluated its performance. Methods: We developed a deterministic algorithm based on advanced data cleaning and knowledge of the naming system and the Damerau-Levenshtein distance (DLD). The algorithm's performance was independently assessed using BDW data of 3 university hospitals: Lille, Nantes, and Rennes. Specificity was evaluated with living patients on January 1, 2016 (ie, patients with at least 1 hospital encounter before and after this date). Sensitivity was evaluated with patients recorded as deceased between January 1, 2001, and December 31, 2020. The DLD-based algorithm was compared to a direct matching algorithm with minimal data cleaning as a reference. Results: All centers combined, sensitivity was 11\% higher for the DLD-based algorithm (93.3\%, 95\% CI 92.8-93.9) than for the direct algorithm (82.7\%, 95\% CI 81.8-83.6; P<.001). Sensitivity was superior for men at 2 centers (Nantes: 87\%, 95\% CI 85.1-89 vs 83.6\%, 95\% CI 81.4-85.8; P=.006; Rennes: 98.6\%, 95\% CI 98.1-99.2 vs 96\%, 95\% CI 94.9-97.1; P<.001) and for patients born in France at all centers (Nantes: 85.8\%, 95\% CI 84.3-87.3 vs 74.9\%, 95\% CI 72.8-77.0; P<.001). The DLD-based algorithm revealed significant differences in sensitivity among centers (Nantes, 85.3\% vs Lille and Rennes, 97.3\%, P<.001). 
Specificity was >98\% in all subgroups. Our algorithm matched tens of millions of death records from BDWs, with parallel computing capabilities and low RAM requirements. We used the Inseehop open-source R script for this measurement. Conclusions: Overall, sensitivity/recall was 11\% higher using the DLD-based algorithm than using the direct algorithm. This shows the importance of advanced data cleaning and knowledge of a naming system through DLD use. Statistically significant differences in sensitivity between groups could be found and must be considered when performing an analysis to avoid differential biases. Our algorithm, originally conceived for linking a BDW with the FNMD, can be used to match any large-scale databases. While matching operations using names are considered sensitive computational operations, the Inseehop package released here is easy to run on premises, thereby facilitating compliance with local cybersecurity frameworks. The use of an advanced deterministic matching algorithm such as the DLD-based algorithm is an insightful example of combining open-source external data to improve the usage value of BDWs. ", doi="10.2196/36711", url="https://medinform.jmir.org/2022/11/e36711", url="http://www.ncbi.nlm.nih.gov/pubmed/36318244" } @Article{info:doi/10.2196/37437, author="Scheenstra, Bart and Bruninx, Anke and van Daalen, Florian and Stahl, Nina and Latuapon, Elizabeth and Imkamp, Maike and Ippel, Lianne and Duijsings-Mahangi, Sulaika and Smits, Djura and Townend, David and Bermejo, Inigo and Dekker, Andre and Hochstenbach, Laura and Spreeuwenberg, Marieke and Maessen, Jos and van 't Hof, Arnoud and Kietselaer, Bas", title="Digital Health Solutions to Reduce the Burden of Atherosclerotic Cardiovascular Disease Proposed by the CARRIER Consortium", journal="JMIR Cardio", year="2022", month="Oct", day="17", volume="6", number="2", pages="e37437", keywords="atherosclerotic cardiovascular disease", keywords="ASCVD", keywords="cardiovascular risk management", keywords="CVRM", keywords="eHealth", keywords="digital Health", keywords="personalized e-coach", keywords="big data", keywords="clinical prediction models", keywords="federated data infrastructure", doi="10.2196/37437", url="https://cardio.jmir.org/2022/2/e37437", url="http://www.ncbi.nlm.nih.gov/pubmed/36251353" } @Article{info:doi/10.2196/37951, author="Kurasawa, Hisashi and Waki, Kayo and Chiba, Akihiro and Seki, Tomohisa and Hayashi, Katsuyoshi and Fujino, Akinori and Haga, Tsuneyuki and Noguchi, Takashi and Ohe, Kazuhiko", title="Treatment Discontinuation Prediction in Patients With Diabetes Using a Ranking Model: Machine Learning Model Development", journal="JMIR Bioinform Biotech", year="2022", month="Sep", day="23", volume="3", number="1", pages="e37951", keywords="machine learning", keywords="machine-learned ranking model", keywords="treatment discontinuation", keywords="diabetes", keywords="prediction", keywords="electronic health record", keywords="EHR", keywords="big data", keywords="ranking", keywords="algorithm", abstract="Background: Treatment discontinuation (TD) is one of the major prognostic issues in diabetes care, and several models have been proposed to predict a missed appointment that may lead to TD in patients with diabetes by using binary classification models for the early detection of TD and for providing intervention support for patients. 
However, as binary classification models output the probability of a missed appointment occurring within a predetermined period, they are limited in their ability to estimate the magnitude of TD risk in patients with inconsistent intervals between appointments, making it difficult to prioritize patients for whom intervention support should be provided. Objective: This study aimed to develop a machine-learned prediction model that can output a TD risk score defined by the length of time until TD and prioritize patients for intervention according to their TD risk. Methods: The model was developed with patients who had diagnostic codes indicative of diabetes at the University of Tokyo Hospital between September 3, 2012, and May 17, 2014. The model was internally validated with patients from the same hospital from May 18, 2014, to January 29, 2016. The data used in this study included 7551 patients who visited the hospital after January 1, 2004, and had diagnostic codes indicative of diabetes. In particular, data that were recorded in the electronic medical records between September 3, 2012, and January 29, 2016, were used. The main outcome was the TD of a patient, which was defined as missing a scheduled clinical appointment and having no hospital visits within 3 times the average number of days between the visits of the patient and within 60 days. The TD risk score was calculated by using the parameters derived from the machine-learned ranking model. The prediction capacity was evaluated by using test data with the C-index for the performance of ranking patients, area under the receiver operating characteristic curve, and area under the precision-recall curve for discrimination, in addition to a calibration plot. Results: The means (95\% confidence limits) of the C-index, area under the receiver operating characteristic curve, and area under the precision-recall curve for the TD risk score were 0.749 (0.655, 0.823), 0.758 (0.649, 0.857), and 0.713 (0.554, 0.841), respectively. The calibration plots showed that the observed and predicted probabilities were correlated. Conclusions: A TD risk score was developed for patients with diabetes by combining a machine-learned method with electronic medical records. The score calculation can be integrated into medical records to identify patients at high risk of TD, which would be useful in supporting diabetes care and preventing TD. ", doi="10.2196/37951", url="https://bioinform.jmir.org/2022/1/e37951" } @Article{info:doi/10.2196/36797, author="Muller, A. Sam H. and van Thiel, W. Ghislaine J. M. and Vrana, Marilena and Mostert, Menno and van Delden, M. Johannes J.", title="Patients' and Publics' Preferences for Data-Intensive Health Research Governance: Survey Study", journal="JMIR Hum Factors", year="2022", month="Sep", day="7", volume="9", number="3", pages="e36797", keywords="data-intensive health research", keywords="big data", keywords="data sharing", keywords="patient and public preferences", keywords="health data sharing conditions", keywords="ethics", keywords="governance", keywords="policy", keywords="patient and public involvement", keywords="research participants", keywords="trust", abstract="Background: Patients and publics are generally positive about data-intensive health research. However, conditions need to be fulfilled for their support. Ensuring confidentiality, security, and privacy of patients' health data is pivotal.
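Returning to the treatment-discontinuation entry above (doi 10.2196/37951): a from-scratch Python sketch of the C-index used there to evaluate how well a risk score ranks patients. The pairwise counting below is standard Harrell-style concordance; the toy data are invented, not the study's.

def c_index(times, events, scores):
    # Among comparable pairs (the patient with the earlier observed
    # discontinuation must have experienced the event), count how often
    # the model gives that patient the higher risk score; ties count 0.5.
    concordant, comparable = 0.0, 0
    n = len(times)
    for i in range(n):
        for j in range(n):
            if events[i] == 1 and times[i] < times[j]:
                comparable += 1
                if scores[i] > scores[j]:
                    concordant += 1.0
                elif scores[i] == scores[j]:
                    concordant += 0.5
    return concordant / comparable

# Toy example: days until discontinuation (or censoring), event flags,
# and a model's risk scores for 4 patients.
times = [30, 90, 180, 365]
events = [1, 1, 0, 0]
scores = [0.9, 0.6, 0.4, 0.2]
print(c_index(times, events, scores))  # 1.0: every comparable pair correctly ordered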
Patients and publics have concerns about secondary use of data by commercial parties and the risk of data misuse, reasons for which they favor personal control of their data. Yet the prospect of public benefit highlights the potential of building trust to attenuate these perceptions of harm and risk. Nevertheless, empirical evidence on how conditions for support of data-intensive health research can be operationalized to that end remains scant. Objective: This study aims to inform efforts to design governance frameworks for data-intensive health research, by gaining insight into the preferences of patients and publics for governance policies and measures. Methods: We distributed a digital questionnaire among a purposive sample of patients and publics. Data were analyzed using descriptive statistics and nonparametric inferential statistics to compare group differences and explore associations between policy preferences. Results: Study participants (N=987) strongly favored sharing their health data for scientific health research. Personal decision-making about which research projects health data are shared with (346/980, 35.3\%), which researchers/organizations can have access (380/978, 38.9\%), and the provision of information (458/981, 46.7\%) were found highly important. Health data--sharing policies strengthening direct personal control, like being able to decide under which conditions health data are shared (538/969, 55.5\%), were found highly important. Policies strengthening collective governance, like reliability checks (805/967, 83.2\%) and security safeguards (787/976, 80.6\%), were also found highly important. Further analysis revealed that participants willing to share health data demanded policies strengthening direct personal control to a lesser extent than participants who were reluctant to share health data. This was the case for the option to have health data deleted at any time (P<.001) and the ability to decide the conditions under which health data can be shared (P<.001). Overall, policies and measures enforcing conditions for support at the collective level of governance, like having an independent committee to evaluate requests for access to health data (P=.02), were most strongly favored. This also applied to participants who explicitly stressed that it was important to be able to decide the conditions under which health data can be shared, for instance, whether sanctions on data misuse are in place (P=.03). Conclusions: This study revealed that both a positive attitude toward health data sharing and demand for personal decision-making abilities were associated with policies and measures strengthening control at the collective level of governance. We recommend pursuing the development of this type of governance policy. More importantly, further study is required to understand how governance policies and measures can contribute to the trustworthiness of data-intensive health research. ", doi="10.2196/36797", url="https://humanfactors.jmir.org/2022/3/e36797", url="http://www.ncbi.nlm.nih.gov/pubmed/36069794" } @Article{info:doi/10.2196/39057, author="Kiser, C. Amber and Eilbeck, Karen and Ferraro, P. Jeffrey and Skarda, E. David and Samore, H. 
Matthew and Bucher, Brian", title="Standard Vocabularies to Improve Machine Learning Model Transferability With Electronic Health Record Data: Retrospective Cohort Study Using Health Care--Associated Infection", journal="JMIR Med Inform", year="2022", month="Aug", day="30", volume="10", number="8", pages="e39057", keywords="standard vocabularies", keywords="machine learning", keywords="electronic health records", keywords="model transferability", keywords="data heterogeneity", abstract="Background: With the widespread adoption of electronic health records (EHRs) by US hospitals, there is an opportunity to leverage these data for the development of predictive algorithms to improve clinical care. A key barrier in model development and implementation is external validation of model discrimination, which is rarely performed and often reveals worse performance. One reason why machine learning models are not externally generalizable is data heterogeneity. A potential solution to address the substantial data heterogeneity between health care systems is to use standard vocabularies to map EHR data elements. The advantage of these vocabularies is the hierarchical relationship between elements, which allows the aggregation of specific clinical features to more general grouped concepts. Objective: This study aimed to evaluate grouping EHR data using standard vocabularies to improve the transferability of machine learning models for the detection of postoperative health care--associated infections across institutions with different EHR systems. Methods: Patients who underwent surgery at University of Utah Health and Intermountain Healthcare from July 2014 to August 2017 with complete follow-up data were included. The primary outcome was a health care--associated infection within 30 days of the procedure. EHR data from 0-30 days after the operation were mapped to standard vocabularies and grouped using the hierarchical relationships of the vocabularies. Model performance was measured using the area under the receiver operating characteristic curve (AUC) and F1-score in internal and external validations. To evaluate model transferability, a difference-in-difference metric was defined as the difference in performance drop between internal and external validations for the baseline and grouped models. Results: A total of 5775 patients from the University of Utah and 15,434 patients from Intermountain Healthcare were included. The prevalence of selected outcomes was from 4.9\% (761/15,434) to 5\% (291/5775) for surgical site infections, from 0.8\% (44/5775) to 1.1\% (171/15,434) for pneumonia, from 2.6\% (400/15,434) to 3\% (175/5775) for sepsis, and from 0.8\% (125/15,434) to 0.9\% (50/5775) for urinary tract infections. In all outcomes, the grouping of data using standard vocabularies resulted in a reduced drop in AUC and F1-score in external validation compared to baseline features (all P<.001, except urinary tract infection AUC: P=.002). The difference-in-difference metrics ranged from 0.005 to 0.248 for AUC and from 0.075 to 0.216 for F1-score. Conclusions: We demonstrated that grouping machine learning model features based on standard vocabularies improved model transferability between data sets across 2 institutions. Improving model transferability using standard vocabularies has the potential to improve the generalization of clinical prediction models across the health care system. 
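A toy Python sketch of the grouping idea evaluated in the standard-vocabularies entry above (doi 10.2196/39057): rolling specific codes up a vocabulary hierarchy so that sites coding at different granularities land on a shared feature. The mini hierarchy and code labels are invented stand-ins, not an actual standard vocabulary.

# Toy parent map standing in for a standard vocabulary's hierarchy.
PARENT = {
    "pneumonia.bacterial": "pneumonia",
    "pneumonia.viral": "pneumonia",
    "pneumonia": "lower.respiratory.infection",
    "bronchitis.acute": "lower.respiratory.infection",
}

def roll_up(code: str, levels: int = 1) -> str:
    # Climb the hierarchy a fixed number of levels (or stop at the root).
    for _ in range(levels):
        code = PARENT.get(code, code)
    return code

def grouped_features(raw_codes: set, levels: int = 1) -> set:
    return {roll_up(c, levels) for c in raw_codes}

# Two sites record the same clinical picture at different granularity;
# grouping maps both onto a common feature space for model transfer.
site_a = {"pneumonia.bacterial"}
site_b = {"pneumonia.viral"}
print(grouped_features(site_a), grouped_features(site_b))  # both: {'pneumonia'}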
", doi="10.2196/39057", url="https://medinform.jmir.org/2022/8/e39057", url="http://www.ncbi.nlm.nih.gov/pubmed/36040784" } @Article{info:doi/10.2196/38154, author="Wang, Peng and Li, Yong and Yang, Liang and Li, Simin and Li, Linfeng and Zhao, Zehan and Long, Shaopei and Wang, Fei and Wang, Hongqian and Li, Ying and Wang, Chengliang", title="An Efficient Method for Deidentifying Protected Health Information in Chinese Electronic Health Records: Algorithm Development and Validation", journal="JMIR Med Inform", year="2022", month="Aug", day="30", volume="10", number="8", pages="e38154", keywords="EHR", keywords="PHI", keywords="personal information", keywords="protected data", keywords="protected information", keywords="patient information", keywords="health information", keywords="de-identification", keywords="de-identify", keywords="privacy", keywords="TinyBert", keywords="model", keywords="development", keywords="algorithm", keywords="machine learning", keywords="CRF", keywords="data augmentation", keywords="health record", keywords="medical record", abstract="Background: With the popularization of electronic health records in China, the utilization of digitalized data has great potential for the development of real-world medical research. However, the data usually contains a great deal of protected health information and the direct usage of this data may cause privacy issues. The task of deidentifying protected health information in electronic health records can be regarded as a named entity recognition problem. Existing rule-based, machine learning--based, or deep learning--based methods have been proposed to solve this problem. However, these methods still face the difficulties of insufficient Chinese electronic health record data and the complex features of the Chinese language. Objective: This paper proposes a method to overcome the difficulties of overfitting and a lack of training data for deep neural networks to enable Chinese protected health information deidentification. Methods: We propose a new model that merges TinyBERT (bidirectional encoder representations from transformers) as a text feature extraction module and the conditional random field method as a prediction module for deidentifying protected health information in Chinese medical electronic health records. In addition, a hybrid data augmentation method that integrates a sentence generation strategy and a mention-replacement strategy is proposed for overcoming insufficient Chinese electronic health records. Results: We compare our method with 5 baseline methods that utilize different BERT models as their feature extraction modules. Experimental results on the Chinese electronic health records that we collected demonstrate that our method had better performance (microprecision: 98.7\%, microrecall: 99.13\%, and micro-F1 score: 98.91\%) and higher efficiency (40\% faster) than all the BERT-based baseline methods. Conclusions: Compared to baseline methods, the efficiency advantage of TinyBERT on our proposed augmented data set was kept while the performance improved for the task of Chinese protected health information deidentification. ", doi="10.2196/38154", url="https://medinform.jmir.org/2022/8/e38154", url="http://www.ncbi.nlm.nih.gov/pubmed/36040774" } @Article{info:doi/10.2196/37379, author="Gore, Ross and Lynch, J. Christopher and Jordan, A. Craig and Collins, Andrew and Robinson, Michael R. 
and Fuller, Gabrielle and Ames, Pearson and Keerthi, Prateek and Kandukuri, Yash", title="Estimating the Health Effects of Adding Bicycle and Pedestrian Paths at the Census Tract Level: Multiple Model Comparison", journal="JMIR Public Health Surveill", year="2022", month="Aug", day="24", volume="8", number="8", pages="e37379", keywords="bicycle paths", keywords="pedestrian paths", keywords="bicycling", keywords="walking", keywords="diabetes", keywords="high blood pressure", keywords="physical health", keywords="factor analysis", keywords="digital neighborhoods", keywords="data analysis", abstract="Background: Adding additional bicycle and pedestrian paths to an area can lead to improved health outcomes for residents over time. However, quantitatively determining which areas benefit more from bicycle and pedestrian paths, how many miles of bicycle and pedestrian paths are needed, and the health outcomes that may be most improved remain open questions. Objective: Our work provides and evaluates a methodology that offers actionable insight for city-level planners, public health officials, and decision makers tasked with the question ``To what extent will adding specified bicycle and pedestrian path mileage to a census tract improve residents' health outcomes over time?'' Methods: We conducted a factor analysis of data from the American Community Survey, the Centers for Disease Control and Prevention 500 Cities project, Strava, and bicycle and pedestrian path location and use data from two different cities (Norfolk, Virginia, and San Francisco, California). We constructed 2 city-specific factor models and used an algorithm to predict the expected mean improvement that a specified number of bicycle and pedestrian path miles contributes to the identified health outcomes. Results: We show that given a factor model constructed from data from 2011 to 2015, the number of additional bicycle and pedestrian path miles in 2016, and a specific census tract, our models forecast health outcome improvements in 2020 more accurately than 2 alternative approaches for both Norfolk, Virginia, and San Francisco, California. Furthermore, for each city, we show that the additional accuracy is a statistically significant improvement (P<.001 in every case) when compared with the alternate approaches. For Norfolk, Virginia (n=31 census tracts), our approach estimated, on average, the percentage of individuals with high blood pressure in the census tract within 1.49\% (SD 0.85\%), the percentage of individuals with diabetes in the census tract within 1.63\% (SD 0.59\%), and the percentage of individuals who had >2 weeks of poor physical health days in the census tract within 1.83\% (SD 0.57\%). For San Francisco (n=49 census tracts), our approach estimated, on average, the percentage of individuals who had a stroke in the census tract within 1.81\% (SD 0.52\%) and the percentage of individuals with diabetes in the census tract within 1.26\% (SD 0.91\%). Conclusions: We propose and evaluate a methodology to enable decision makers to weigh the extent to which 2 bicycle and pedestrian paths of equal cost, which were proposed in different census tracts, improve residents' health outcomes; identify areas where bicycle and pedestrian paths are unlikely to be effective interventions and other strategies should be used; and quantify the minimum amount of additional bicycle path miles needed to maximize health outcome improvements. 
Our methodology shows statistically significant improvements, compared with alternative approaches, in historical accuracy for 2 large cities (for 2016) within different geographic areas and with different demographics. ", doi="10.2196/37379", url="https://publichealth.jmir.org/2022/8/e37379", url="http://www.ncbi.nlm.nih.gov/pubmed/36001362" } @Article{info:doi/10.2196/38122, author="Noor, Kawsar and Roguski, Lukasz and Bai, Xi and Handy, Alex and Klapaukh, Roman and Folarin, Amos and Romao, Luis and Matteson, Joshua and Lea, Nathan and Zhu, Leilei and Asselbergs, W. Folkert and Wong, Keong Wai and Shah, Anoop and Dobson, JB Richard", title="Deployment of a Free-Text Analytics Platform at a UK National Health Service Research Hospital: CogStack at University College London Hospitals", journal="JMIR Med Inform", year="2022", month="Aug", day="24", volume="10", number="8", pages="e38122", keywords="natural language processing", keywords="text mining", keywords="information retrieval", keywords="electronic health record system", keywords="clinical support", abstract="Background: As more health care organizations transition to using electronic health record (EHR) systems, it is important for these organizations to maximize the secondary use of their data to support service improvement and clinical research. These organizations will find it challenging to build systems capable of harnessing the unstructured data fields in the record (clinical notes, letters, etc) and, more practically, to have such systems interact with all of the hospital's data systems (legacy and current). Objective: We describe the deployment of the EHR interfacing information extraction and retrieval platform CogStack at University College London Hospitals (UCLH). Methods: At UCLH, we have deployed the CogStack platform, an information retrieval platform with natural language processing capabilities. The platform addresses the problem of data ingestion and harmonization from multiple data sources using the Apache NiFi module for managing complex data flows. The platform also facilitates the extraction of structured data from free-text records through use of the MedCAT natural language processing library. Finally, data science tools are made available to support data scientists and the development of downstream applications dependent upon data ingested and analyzed by CogStack. Results: The platform has been deployed at the hospital, and in particular, it has facilitated a number of research and service evaluation projects. To date, we have processed over 30 million records, and the insights produced from CogStack have informed a number of clinical research use cases at the hospital. Conclusions: The CogStack platform can be configured to handle the data ingestion and harmonization challenges faced by a hospital. More importantly, the platform enables the hospital to unlock important clinical information from the unstructured portion of the record using natural language processing technology. 
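The CogStack entry above pairs Apache NiFi ingestion with the MedCAT library for extraction; the toy matcher below is not the MedCAT API but only illustrates the step it performs, turning free-text notes into structured rows. The term list and concept IDs are illustrative.

import re

# Illustrative term-to-concept dictionary; real deployments use MedCAT
# models built against vocabularies such as UMLS or SNOMED CT.
TERM_TO_CONCEPT = {
    "myocardial infarction": "C0027051",
    "atrial fibrillation": "C0004238",
    "type 2 diabetes": "C0011860",
}

def extract_concepts(note: str):
    # Return (concept_id, matched_term, character_offset) tuples per hit.
    hits = []
    for term, cui in TERM_TO_CONCEPT.items():
        for m in re.finditer(re.escape(term), note.lower()):
            hits.append((cui, term, m.start()))
    return hits

note = "Known atrial fibrillation; prior myocardial infarction in 2019."
for cui, term, offset in extract_concepts(note):
    print(cui, term, offset)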
", doi="10.2196/38122", url="https://medinform.jmir.org/2022/8/e38122", url="http://www.ncbi.nlm.nih.gov/pubmed/36001371" } @Article{info:doi/10.2196/37622, author="Park, Yoonseo and Park, Sewon and Lee, Munjae", title="Digital Health Care Industry Ecosystem: Network Analysis", journal="J Med Internet Res", year="2022", month="Aug", day="17", volume="24", number="8", pages="e37622", keywords="digital health care", keywords="industrial ecosystem", keywords="network analysis", keywords="topic modeling", keywords="South Korea", abstract="Background: As the need for digital health care based on mobile devices is increasing, with the rapid development of digital technologies, especially in the face of the COVID-19 pandemic, gaining a better understanding of the industrial structure is needed to activate the use of digital health care. Objective: The aim of this study was to suggest measures to revitalize the digital health care industry by deriving the stakeholders and major issues with respect to the ecosystem of the industry. Methods: A total of 1822 newspaper articles were collected using Big Kings, a big data system for news, for a limited period from 2016 to August 2021, when the mobile health care project was promoted in Korea centered on public health centers. The R and NetMiner programs were used for network analysis. Results: The Korean government and the Ministry of Health and Welfare showed the highest centrality and appeared as major stakeholders, and their common major issues were ``reviewing the introduction of telemedicine,'' ``concerns about bankruptcy of local clinics,'' and ``building an integrated platform for precision medicine.'' In addition, the major stakeholders of medical institutions and companies were Seoul National University Hospital, Kangbuk Samsung Hospital, Ajou University Hospital, Samsung, and Vuno Inc. Conclusions: This analysis confirmed that the issues related to digital health care are largely composed of telemedicine, data, and health care business. For digital health care to develop as a national innovative growth engine and to be institutionalized, the development of a digital health care fee model that can improve the regulatory system and the cost-effectiveness of patient care, centering on the Ministry of Health and Welfare as a key stakeholder, is essential. ", doi="10.2196/37622", url="https://www.jmir.org/2022/8/e37622", url="http://www.ncbi.nlm.nih.gov/pubmed/35976690" } @Article{info:doi/10.2196/37584, author="Rose, Christian and D{\'i}az, Mark and D{\'i}az, Tom{\'a}s", title="Addressing Medicine's Dark Matter", journal="Interact J Med Res", year="2022", month="Aug", day="17", volume="11", number="2", pages="e37584", keywords="big data", keywords="AI", keywords="artificial intelligence", keywords="equity", keywords="data collection", keywords="health care", keywords="prediction", keywords="model", keywords="predict", keywords="representative", keywords="unrepresented", doi="10.2196/37584", url="https://www.i-jmr.org/2022/2/e37584", url="http://www.ncbi.nlm.nih.gov/pubmed/35976194" } @Article{info:doi/10.2196/37756, author="Krzyzanowski, Brittany and Manson, M. 
Steven", title="Twenty Years of the Health Insurance Portability and Accountability Act Safe Harbor Provision: Unsolved Challenges and Ways Forward", journal="JMIR Med Inform", year="2022", month="Aug", day="3", volume="10", number="8", pages="e37756", keywords="Health Insurance Portability and Accountability Act", keywords="HIPAA", keywords="data privacy", keywords="health", keywords="maps", keywords="safe harbor", keywords="visualization", keywords="patient privacy", doi="10.2196/37756", url="https://medinform.jmir.org/2022/8/e37756", url="http://www.ncbi.nlm.nih.gov/pubmed/35921140" } @Article{info:doi/10.2196/37817, author="Tang, Wentai and Wang, Jian and Lin, Hongfei and Zhao, Di and Xu, Bo and Zhang, Yijia and Yang, Zhihao", title="A Syntactic Information--Based Classification Model for Medical Literature: Algorithm Development and Validation Study", journal="JMIR Med Inform", year="2022", month="Aug", day="2", volume="10", number="8", pages="e37817", keywords="medical relation extraction", keywords="syntactic features", keywords="pruning method", keywords="neural networks", keywords="medical literature", keywords="medical text", keywords="extraction", keywords="syntactic", keywords="classification", keywords="interaction", keywords="text", keywords="literature", keywords="semantic", abstract="Background: The ever-increasing volume of medical literature necessitates the classification of medical literature. Medical relation extraction is a typical method of classifying a large volume of medical literature. With the development of arithmetic power, medical relation extraction models have evolved from rule-based models to neural network models. The single neural network model discards the shallow syntactic information while discarding the traditional rules. Therefore, we propose a syntactic information--based classification model that complements and equalizes syntactic information to enhance the model. Objective: We aim to complete a syntactic information--based relation extraction model for more efficient medical literature classification. Methods: We devised 2 methods for enhancing syntactic information in the model. First, we introduced shallow syntactic information into the convolutional neural network to enhance nonlocal syntactic interactions. Second, we devise a cross-domain pruning method to equalize local and nonlocal syntactic interactions. Results: We experimented with 3 data sets related to the classification of medical literature. The F1 values were 65.5\% and 91.5\% on the BioCreative ViCPR (CPR) and Phenotype-Gene Relationship data sets, respectively, and the accuracy was 88.7\% on the PubMed data set. Our model outperforms the current state-of-the-art baseline model in the experiments. Conclusions: Our model based on syntactic information effectively enhances medical relation extraction. Furthermore, the results of the experiments show that shallow syntactic information helps obtain nonlocal interaction in sentences and effectively reinforces syntactic features. It also provides new ideas for future research directions. ", doi="10.2196/37817", url="https://medinform.jmir.org/2022/8/e37817", url="http://www.ncbi.nlm.nih.gov/pubmed/35917162" } @Article{info:doi/10.2196/33678, author="Baker, William and Colditz, B. Jason and Dobbs, D. Page and Mai, Huy and Visweswaran, Shyam and Zhan, Justin and Primack, A. 
Brian", title="Classification of Twitter Vaping Discourse Using BERTweet: Comparative Deep Learning Study", journal="JMIR Med Inform", year="2022", month="Jul", day="21", volume="10", number="7", pages="e33678", keywords="vaping", keywords="social media", keywords="deep learning", keywords="transformer models", keywords="infoveillance", abstract="Background: Twitter provides a valuable platform for the surveillance and monitoring of public health topics; however, manually categorizing large quantities of Twitter data is labor intensive and presents barriers to identify major trends and sentiments. Additionally, while machine and deep learning approaches have been proposed with high accuracy, they require large, annotated data sets. Public pretrained deep learning classification models, such as BERTweet, produce higher-quality models while using smaller annotated training sets. Objective: This study aims to derive and evaluate a pretrained deep learning model based on BERTweet that can identify tweets relevant to vaping, tweets (related to vaping) of commercial nature, and tweets with provape sentiment. Additionally, the performance of the BERTweet classifier will be compared against a long short-term memory (LSTM) model to show the improvements a pretrained model has over traditional deep learning approaches. Methods: Twitter data were collected from August to October 2019 using vaping-related search terms. From this set, a random subsample of 2401 English tweets was manually annotated for relevance (vaping related or not), commercial nature (commercial or not), and sentiment (positive, negative, or neutral). Using the annotated data, 3 separate classifiers were built using BERTweet with the default parameters defined by the Simple Transformer application programming interface (API). Each model was trained for 20 iterations and evaluated with a random split of the annotated tweets, reserving 10\% (n=165) of tweets for evaluations. Results: The relevance, commercial, and sentiment classifiers achieved an area under the receiver operating characteristic curve (AUROC) of 94.5\%, 99.3\%, and 81.7\%, respectively. Additionally, the weighted F1 scores of each were 97.6\%, 99.0\%, and 86.1\%, respectively. We found that BERTweet outperformed the LSTM model in the classification of all categories. Conclusions: Large, open-source deep learning classifiers, such as BERTweet, can provide researchers the ability to reliably determine if tweets are relevant to vaping; include commercial content; and include positive, negative, or neutral content about vaping with a higher accuracy than traditional natural language processing deep learning models. Such enhancement to the utilization of Twitter data can allow for faster exploration and dissemination of time-sensitive data than traditional methodologies (eg, surveys, polling research). 
", doi="10.2196/33678", url="https://medinform.jmir.org/2022/7/e33678", url="http://www.ncbi.nlm.nih.gov/pubmed/35862172" } @Article{info:doi/10.2196/34204, author="Klimek, Peter and Baltic, Dejan and Brunner, Martin and Degelsegger-Marquez, Alexander and Garh{\"o}fer, Gerhard and Gouya-Lechner, Ghazaleh and Herzog, Arnold and Jilma, Bernd and K{\"a}hler, Stefan and Mikl, Veronika and Mraz, Bernhard and Ostermann, Herwig and R{\"o}hl, Claas and Scharinger, Robert and Stamm, Tanja and Strassnig, Michael and Wirthumer-Hoche, Christa and Pleiner-Duxneuner, Johannes", title="Quality Criteria for Real-world Data in Pharmaceutical Research and Health Care Decision-making: Austrian Expert Consensus", journal="JMIR Med Inform", year="2022", month="Jun", day="17", volume="10", number="6", pages="e34204", keywords="real-world data", keywords="real-world evidence", keywords="data quality", keywords="data quality criteria", keywords="RWD quality recommendations", keywords="pharmaceutical research", keywords="health care decision-making", keywords="quality criteria for RWD in health care", keywords="Gesellschaft f{\"u}r Pharmazeutische Medizin", keywords="GPMed", doi="10.2196/34204", url="https://medinform.jmir.org/2022/6/e34204", url="http://www.ncbi.nlm.nih.gov/pubmed/35713954" } @Article{info:doi/10.2196/34305, author="Khademi Habibabadi, Sedigheh and Delir Haghighi, Pari and Burstein, Frada and Buttery, Jim", title="Vaccine Adverse Event Mining of Twitter Conversations: 2-Phase Classification Study", journal="JMIR Med Inform", year="2022", month="Jun", day="16", volume="10", number="6", pages="e34305", keywords="immunization", keywords="vaccines", keywords="natural language processing", keywords="vaccine adverse effects", keywords="vaccine safety", keywords="social media", keywords="Twitter", keywords="machine learning", abstract="Background: Traditional monitoring for adverse events following immunization (AEFI) relies on various established reporting systems, where there is inevitable lag between an AEFI occurring and its potential reporting and subsequent processing of reports. AEFI safety signal detection strives to detect AEFI as early as possible, ideally close to real time. Monitoring social media data holds promise as a resource for this. Objective: The primary aim of this study is to investigate the utility of monitoring social media for gaining early insights into vaccine safety issues, by extracting vaccine adverse event mentions (VAEMs) from Twitter, using natural language processing techniques. The secondary aims are to document the natural language processing techniques used and identify the most effective of them for identifying tweets that contain VAEM, with a view to define an approach that might be applicable to other similar social media surveillance tasks. Methods: A VAEM-Mine method was developed that combines topic modeling with classification techniques to extract maximal VAEM posts from a vaccine-related Twitter stream, with high degree of confidence. The approach does not require a targeted search for specific vaccine reaction--indicative words, but instead, identifies VAEM posts according to their language structure. Results: The VAEM-Mine method isolated 8992 VAEMs from 811,010 vaccine-related Twitter posts and achieved an F1 score of 0.91 in the classification phase. Conclusions: Social media can assist with the detection of vaccine safety signals as a valuable complementary source for monitoring mentions of vaccine adverse events. 
A social media--based VAEM data stream can be assessed for changes to detect possible emerging vaccine safety signals, helping to address the well-recognized limitations of passive reporting systems, including lack of timeliness and underreporting. ", doi="10.2196/34305", url="https://medinform.jmir.org/2022/6/e34305", url="http://www.ncbi.nlm.nih.gov/pubmed/35708760" } @Article{info:doi/10.2196/35422, author="Fr{\"a}nti, Pasi and Sieranoja, Sami and Wikstr{\"o}m, Katja and Laatikainen, Tiina", title="Clustering Diagnoses From 58 Million Patient Visits in Finland Between 2015 and 2018", journal="JMIR Med Inform", year="2022", month="May", day="4", volume="10", number="5", pages="e35422", keywords="multimorbidity", keywords="cluster analysis", keywords="disease co-occurrence", keywords="multimorbidity network", keywords="health care data analysis", keywords="graph clustering", keywords="k-means", keywords="data analysis", keywords="cluster", keywords="machine learning", keywords="comorbidity", keywords="register", keywords="big data", keywords="Finland", keywords="Europe", keywords="health record", abstract="Background: Multiple chronic diseases in patients are a major burden on the health service system. Currently, diseases are mostly treated separately without paying sufficient attention to their relationships, which results in the fragmentation of the care process. The better integration of services can lead to the more effective organization of the overall health care system. Objective: This study aimed to analyze the connections between diseases based on their co-occurrences to support decision-makers in better organizing health care services. Methods: We performed a cluster analysis of diagnoses by using data from the Finnish Health Care Registers for primary and specialized health care visits and inpatient care. The target population of this study comprised those 3.8 million individuals (3,835,531/5,487,308, 69.90\% of the whole population) aged ≥18 years who used health care services from the years 2015 to 2018. They had a total of 58 million visits. Clustering was performed based on the co-occurrence of diagnoses. The more the same pair of diagnoses appeared in the records of the same patients, the more the diagnoses correlated with each other. On the basis of the co-occurrences, we calculated the relative risk of each pair of diagnoses and clustered the data by using a graph-based clustering algorithm called the M-algorithm---a variant of k-means. Results: The results revealed multimorbidity clusters, of which some were expected (eg, one representing hypertensive and cardiovascular diseases). Other clusters were more unexpected, such as the cluster containing lower respiratory tract diseases and systemic connective tissue disorders. The annual cost of all clusters was {\texteuro}10.0 billion, and the costliest cluster was cardiovascular and metabolic problems, costing {\texteuro}2.3 billion. Conclusions: The method and the achieved results provide new insights into identifying key multimorbidity groups, especially those resulting in burden and costs in health care services. 
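A compact Python sketch of the co-occurrence statistic at the heart of the clustering entry above (doi 10.2196/35422): the relative risk that two diagnoses appear in the same patient, compared with independence. The patient records are invented, and the M-algorithm graph clustering itself is not reproduced here.

from itertools import combinations

# Toy patient -> set-of-diagnoses records (ICD-10-style codes, invented data).
patients = {
    "p1": {"I10", "E11", "E78"},  # hypertension, type 2 diabetes, dyslipidemia
    "p2": {"I10", "E78"},
    "p3": {"J44", "M35"},         # COPD, connective tissue disorder
    "p4": {"I10", "E11"},
}

def pairwise_relative_risk(records):
    n = len(records)
    counts, pair_counts = {}, {}
    for dx_set in records.values():
        for dx in dx_set:
            counts[dx] = counts.get(dx, 0) + 1
        for a, b in combinations(sorted(dx_set), 2):
            pair_counts[(a, b)] = pair_counts.get((a, b), 0) + 1
    # RR = P(A and B) / (P(A) * P(B)); RR >> 1 marks diagnoses that co-occur
    # far more often than chance, ie, the weighted edges fed to clustering.
    return {
        pair: (c / n) / ((counts[pair[0]] / n) * (counts[pair[1]] / n))
        for pair, c in pair_counts.items()
    }

for pair, rr in sorted(pairwise_relative_risk(patients).items()):
    print(pair, round(rr, 2))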
", doi="10.2196/35422", url="https://medinform.jmir.org/2022/5/e35422", url="http://www.ncbi.nlm.nih.gov/pubmed/35507390" } @Article{info:doi/10.2196/30898, author="Ye, Jiancheng and Wang, Zidan and Hai, Jiarui", title="Social Networking Service, Patient-Generated Health Data, and Population Health Informatics: National Cross-sectional Study of Patterns and Implications of Leveraging Digital Technologies to Support Mental Health and Well-being", journal="J Med Internet Res", year="2022", month="Apr", day="29", volume="24", number="4", pages="e30898", keywords="patient-generated health data", keywords="social network", keywords="population health informatics", keywords="mental health", keywords="social determinants of health", keywords="health data sharing", keywords="technology acceptability", keywords="mobile phone", keywords="mobile health", abstract="Background: The emerging health technologies and digital services provide effective ways of collecting health information and gathering patient-generated health data (PGHD), which provide a more holistic view of a patient's health and quality of life over time, increase visibility into a patient's adherence to a treatment plan or study protocol, and enable timely intervention before a costly care episode. Objective: Through a national cross-sectional survey in the United States, we aimed to describe and compare the characteristics of populations with and without mental health issues (depression or anxiety disorders), including physical health, sleep, and alcohol use. We also examined the patterns of social networking service use, PGHD, and attitudes toward health information sharing and activities among the participants, which provided nationally representative estimates. Methods: We drew data from the 2019 Health Information National Trends Survey of the National Cancer Institute. The participants were divided into 2 groups according to mental health status. Then, we described and compared the characteristics of the social determinants of health, health status, sleeping and drinking behaviors, and patterns of social networking service use and health information data sharing between the 2 groups. Multivariable logistic regression models were applied to assess the predictors of mental health. All the analyses were weighted to provide nationally representative estimates. Results: Participants with mental health issues were significantly more likely to be younger, White, female, and lower-income; have a history of chronic diseases; and be less capable of taking care of their own health. Regarding behavioral health, they slept <6 hours on average, had worse sleep quality, and consumed more alcohol. In addition, they were more likely to visit and share health information on social networking sites, write online diary blogs, participate in online forums or support groups, and watch health-related videos. Conclusions: This study illustrates that individuals with mental health issues have inequitable social determinants of health, poor physical health, and poor behavioral health. However, they are more likely to use social networking platforms and services, share their health information, and actively engage with PGHD. Leveraging these digital technologies and services could be beneficial for developing tailored and effective strategies for self-monitoring and self-management. 
", doi="10.2196/30898", url="https://www.jmir.org/2022/4/e30898", url="http://www.ncbi.nlm.nih.gov/pubmed/35486428" } @Article{info:doi/10.2196/32776, author="Yuan, Junyi and Wang, Sufen and Pan, Changqing", title="Mechanism of Impact of Big Data Resources on Medical Collaborative Networks From the Perspective of Transaction Efficiency of Medical Services: Survey Study", journal="J Med Internet Res", year="2022", month="Apr", day="21", volume="24", number="4", pages="e32776", keywords="medical collaborative networks", keywords="big data resources", keywords="transaction efficiency", abstract="Background: The application of big data resources and the development of medical collaborative networks (MCNs) boost each other. However, MCNs are often assumed to be exogenous. How big data resources affect the emergence, development, and evolution of endogenous MCNs has not been well explained. Objective: This study aimed to explore and understand the influence of the mechanism of a wide range of shared and private big data resources on the transaction efficiency of medical services to reveal the impact of big data resources on the emergence and development of endogenous MCNs. Methods: This study was conducted by administering a survey questionnaire to information technology staff and medical staff from 132 medical institutions in China. Data from information technology staff and medical staff were integrated. Structural equation modeling was used to test the direct impact of big data resources on transaction efficiency of medical services. For those big data resources that had no direct impact, we analyzed their indirect impact. Results: Sharing of diagnosis and treatment data ($\beta$=.222; P=.03) and sharing of medical research data ($\beta$=.289; P=.04) at the network level (as big data itself) positively directly affected the transaction efficiency of medical services. Network protection of the external link systems ($\beta$=.271; P=.008) at the level of medical institutions (as big data technology) positively directly affected the transaction efficiency of medical services. Encryption security of web-based data (as big data technology) at the level of medical institutions, medical service capacity available for external use, real-time data of diagnosis and treatment services (as big data itself) at the level of medical institutions, and policies and regulations at the network level indirectly affected the transaction efficiency through network protection of the external link systems at the level of medical institutions. Conclusions: This study found that big data technology, big data itself, and policy at the network and organizational levels interact with, and influence, each other to form the transaction efficiency of medical services. On the basis of the theory of neoclassical economics, the study highlighted the implications of big data resources for the emergence and development of endogenous MCNs. 
", doi="10.2196/32776", url="https://www.jmir.org/2022/4/e32776", url="http://www.ncbi.nlm.nih.gov/pubmed/35318187" } @Article{info:doi/10.2196/36481, author="Wang, Miye and Li, Sheyu and Zheng, Tao and Li, Nan and Shi, Qingke and Zhuo, Xuejun and Ding, Renxin and Huang, Yong", title="Big Data Health Care Platform With Multisource Heterogeneous Data Integration and Massive High-Dimensional Data Governance for Large Hospitals: Design, Development, and Application", journal="JMIR Med Inform", year="2022", month="Apr", day="13", volume="10", number="4", pages="e36481", keywords="big data platform in health care", keywords="multisource", keywords="heterogeneous", keywords="data integration", keywords="data governance", keywords="data application", keywords="data security", keywords="data quality control", keywords="big data", keywords="data science", keywords="medical informatics", keywords="health care", abstract="Background: With the advent of data-intensive science, a full integration of big data science and health care will bring a cross-field revolution to the medical community in China. The concept big data represents not only a technology but also a resource and a method. Big data are regarded as an important strategic resource both at the national level and at the medical institutional level, thus great importance has been attached to the construction of a big data platform for health care. Objective: We aimed to develop and implement a big data platform for a large hospital, to overcome difficulties in integrating, calculating, storing, and governing multisource heterogeneous data in a standardized way, as well as to ensure health care data security. Methods: The project to build a big data platform at West China Hospital of Sichuan University was launched in 2017. The West China Hospital of Sichuan University big data platform has extracted, integrated, and governed data from different departments and sections of the hospital since January 2008. A master--slave mode was implemented to realize the real-time integration of multisource heterogeneous massive data, and an environment that separates heterogeneous characteristic data storage and calculation processes was built. A business-based metadata model was improved for data quality control, and a standardized health care data governance system and scientific closed-loop data security ecology were established. Results: After 3 years of design, development, and testing, the West China Hospital of Sichuan University big data platform was formally brought online in November 2020. It has formed a massive multidimensional data resource database, with more than 12.49 million patients, 75.67 million visits, and 8475 data variables. Along with hospital operations data, newly generated data are entered into the platform in real time. Since its launch, the platform has supported more than 20 major projects and provided data service, storage, and computing power support to many scientific teams, facilitating a shift in the data support model---from conventional manual extraction to self-service retrieval (which has reached 8561 retrievals per month). Conclusions: The platform can combine operation systems data from all departments and sections in a hospital to form a massive high-dimensional high-quality health care database that allows electronic medical records to be used effectively and taps into the value of data to fully support clinical services, scientific research, and operations management. 
The platform successfully provides multisource heterogeneous data storage and computing power. By effectively governing massive multidimensional data gathered from multiple sources, it provides highly available data assets and thus has a high application value in the health care field. It also facilitates simpler and more efficient utilization of electronic medical record data for real-world research. ", doi="10.2196/36481", url="https://medinform.jmir.org/2022/4/e36481", url="http://www.ncbi.nlm.nih.gov/pubmed/35416792" } @Article{info:doi/10.2196/32578, author="Chew, Jocelyn Han Shi", title="The Use of Artificial Intelligence--Based Conversational Agents (Chatbots) for Weight Loss: Scoping Review and Practical Recommendations", journal="JMIR Med Inform", year="2022", month="Apr", day="13", volume="10", number="4", pages="e32578", keywords="chatbot", keywords="conversational agent", keywords="artificial intelligence", keywords="weight loss", keywords="obesity", keywords="overweight", keywords="natural language processing", keywords="sentiment analysis", keywords="machine learning", keywords="behavior change", keywords="mobile phone", abstract="Background: Overweight and obesity have now reached the scale of a pandemic despite the clinical and commercial programs available. Artificial intelligence (AI) chatbots have a strong potential in optimizing such programs for weight loss. Objective: This study aimed to review AI chatbot use cases for weight loss and to identify the essential components for prolonging user engagement. Methods: A scoping review was conducted using the 5-stage framework by Arksey and O'Malley. Articles were searched across nine electronic databases (ACM Digital Library, CINAHL, Cochrane Central, Embase, IEEE Xplore, PsycINFO, PubMed, Scopus, and Web of Science) until July 9, 2021. Gray literature, reference lists, and Google Scholar were also searched. Results: A total of 23 studies with 2231 participants were included and evaluated in this review. Most studies (8/23, 35\%) focused on using AI chatbots to promote both a healthy diet and exercise, 13\% (3/23) of the studies used AI chatbots solely for lifestyle data collection and obesity risk assessment, whereas only 4\% (1/23) of the studies focused on promoting a combination of a healthy diet, exercise, and stress management. In total, 48\% (11/23) of the studies used only text-based AI chatbots, 52\% (12/23) operationalized AI chatbots through smartphones, and 39\% (9/23) integrated data collected through fitness wearables or Internet of Things appliances. The core functions of AI chatbots were to provide personalized recommendations (20/23, 87\%), motivational messages (18/23, 78\%), gamification (6/23, 26\%), and emotional support (6/23, 26\%). Study participants who experienced speech- and augmented reality--based chatbot interactions in addition to text-based chatbot interactions reported higher user engagement because of the convenience of hands-free interactions. Enabling conversations through multiple platforms (eg, SMS text messaging, Slack, Telegram, Signal, WhatsApp, or Facebook Messenger) and devices (eg, laptops, Google Home, and Amazon Alexa) was reported to increase user engagement. The human semblance of chatbots through verbal and nonverbal cues improved user engagement through interactivity and empathy. 
Other techniques used in text-based chatbots included personally and culturally appropriate colloquial tones and content; emojis that emulate human emotional expressions; positively framed words; citations of credible information sources; personification; validation; and the provision of real-time, fast, and reliable recommendations. Prevailing issues included privacy; accountability; user burden; and interoperability with other databases, third-party applications, social media platforms, devices, and appliances. Conclusions: AI chatbots should be designed to be human-like, personalized, contextualized, immersive, and enjoyable to enhance user experience, engagement, behavior change, and weight loss. These require the integration of health metrics (eg, based on self-reports and wearable trackers), personality and preferences (eg, based on goal achievements), circumstantial behaviors (eg, trigger-based overconsumption), and emotional states (eg, chatbot conversations and wearable stress detectors) to deliver personalized and effective recommendations for weight loss. ", doi="10.2196/32578", url="https://medinform.jmir.org/2022/4/e32578", url="http://www.ncbi.nlm.nih.gov/pubmed/35416791" } @Article{info:doi/10.2196/33680, author="Gunasekeran, Visva Dinesh and Chew, Alton and Chandrasekar, K. Eeshwar and Rajendram, Priyanka and Kandarpa, Vasundhara and Rajendram, Mallika and Chia, Audrey and Smith, Helen and Leong, Kit Choon", title="The Impact and Applications of Social Media Platforms for Public Health Responses Before and During the COVID-19 Pandemic: Systematic Literature Review", journal="J Med Internet Res", year="2022", month="Apr", day="11", volume="24", number="4", pages="e33680", keywords="digital health", keywords="social media", keywords="big data", keywords="population health", keywords="blockchain", keywords="COVID-19", keywords="review", keywords="benefit", keywords="challenge", keywords="public health", abstract="Background: Social media platforms have numerous potential benefits and drawbacks for public health, which have been described in the literature. The COVID-19 pandemic has exposed our limited knowledge regarding the potential health impact of these platforms, which have been detrimental to public health responses in many regions. Objective: This review aims to highlight a brief history of social media in health care and report its potential negative and positive public health impacts, which have been characterized in the literature. Methods: We searched electronic bibliographic databases, including PubMed (including MEDLINE) and Institute of Electrical and Electronics Engineers (IEEE) Xplore, from December 10, 2015, to December 10, 2020. We screened the titles and abstracts and selected relevant reports for review of full text and reference lists. These were analyzed thematically and consolidated into applications of social media platforms for public health. Results: The positive and negative impacts of social media platforms on public health are catalogued on the basis of recent research in this report. These findings are discussed in the context of improving future public health responses and incorporating other emerging digital technology domains such as artificial intelligence. However, there is a need for more research with pragmatic methodology that evaluates the impact of specific digital interventions to inform future health policy. 
Conclusions: Recent research has highlighted the potential negative impact of social media platforms on population health, as well as potentially useful applications for public health communication, monitoring, and predictions. More research is needed to objectively investigate measures to mitigate its negative impact while harnessing effective applications for the benefit of public health. ", doi="10.2196/33680", url="https://www.jmir.org/2022/4/e33680", url="http://www.ncbi.nlm.nih.gov/pubmed/35129456" } @Article{info:doi/10.2196/29385, author="Alexander, George and Bahja, Mohammed and Butt, Farook Gibran", title="Automating Large-scale Health Care Service Feedback Analysis: Sentiment Analysis and Topic Modeling Study", journal="JMIR Med Inform", year="2022", month="Apr", day="11", volume="10", number="4", pages="e29385", keywords="natural language processing", keywords="topic modeling", keywords="National Health Service", keywords="latent Dirichlet allocation", keywords="reviews", keywords="patient feedback", keywords="automated solutions", keywords="large-scale health service", keywords="free-text", keywords="unstructured data", abstract="Background: Obtaining patient feedback is an essential mechanism for health care service providers to assess their quality and effectiveness. Unlike assessments of clinical outcomes, feedback from patients offers insights into their lived experiences. The Department of Health and Social Care in England, via National Health Service Digital, operates a patient feedback web service through which patients can leave feedback on their experiences in structured and free-text report forms. Free-text feedback, compared with structured questionnaires, may be less biased by the feedback collector and, thus, more representative; however, it is harder to analyze in large quantities, and it is challenging to derive meaningful, quantitative outcomes from it. Objective: The aim of this study is to build a novel data analysis and interactive visualization pipeline accessible through an interactive web application to facilitate the interrogation of and provide unique insights into National Health Service patient feedback. Methods: This study details the development of a text analysis tool that uses contemporary natural language processing and machine learning models to analyze free-text clinical service reviews to develop a robust classification model and interactive visualization web application. The methodology is based on the design science research paradigm and was conducted in three iterations: a sentiment analysis of the patient feedback corpus in the first iteration, topic modeling (unigram and bigram)--based analysis for topic identification in the second iteration, and nested topic modeling in the third iteration that combines sentiment analysis and topic modeling methods. An interactive data visualization web application for use by the general public was then created, presenting the data on a geographic representation of the country, making it easily accessible. Results: Of the 11,103 possible clinical services that could be reviewed across England, 2030 (18.28\%) different services received a combined total of 51,845 reviews between October 1, 2017, and September 30, 2019. Dominant topics were identified for the entire corpus followed by negative- and positive-sentiment topics in turn. Reviews containing high- and low-sentiment topics occurred more frequently than reviews containing less polarized topics. 
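A minimal scikit-learn sketch of the topic-identification step described in this feedback-analysis entry (doi 10.2196/29385), using unigrams and bigrams as in its second design iteration; the nested variant would fit separate models within positive- and negative-sentiment reviews. The toy reviews are invented.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

reviews = [
    "staff were kind and the nurses explained everything clearly",
    "waited three hours and nobody updated us on the delay",
    "reception was rude but the doctor was thorough and kind",
    "appointment delayed again, poor communication about waiting times",
]

# Unigram and bigram counts feed the topic model.
vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 2))
dtm = vectorizer.fit_transform(reviews)

lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(dtm)

# Print the highest-weighted terms per topic.
terms = vectorizer.get_feature_names_out()
for k, weights in enumerate(lda.components_):
    top = [terms[i] for i in weights.argsort()[-5:][::-1]]
    print(f"topic {k}: {top}")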
Time-series analysis identified trends in topic and sentiment occurrence frequency across the study period. Conclusions: Using contemporary natural language processing techniques, unstructured text data were effectively characterized for further analysis and visualization. An efficient pipeline was successfully combined with a web application, making automated analysis and dissemination of large volumes of information accessible. This study represents a significant step in efforts to generate and visualize useful, actionable, and unique information from free-text patient reviews. ", doi="10.2196/29385", url="https://medinform.jmir.org/2022/4/e29385", url="http://www.ncbi.nlm.nih.gov/pubmed/35404254" } @Article{info:doi/10.2196/35734, author="El Emam, Khaled and Mosquera, Lucy and Fang, Xi and El-Hussuna, Alaa", title="Utility Metrics for Evaluating Synthetic Health Data Generation Methods: Validation Study", journal="JMIR Med Inform", year="2022", month="Apr", day="7", volume="10", number="4", pages="e35734", keywords="synthetic data", keywords="data utility", keywords="data privacy", keywords="generative models", keywords="utility metric", keywords="synthetic data generation", keywords="logistic regression", keywords="model validation", keywords="medical informatics", keywords="binary prediction model", keywords="prediction model", abstract="Background: A regular task for developers and users of synthetic data generation (SDG) methods is to evaluate and compare the utility of these methods. Multiple utility metrics have been proposed and used to evaluate synthetic data. However, they have not been validated in general or for comparing SDG methods. Objective: This study evaluates the ability of common utility metrics to rank SDG methods according to performance on a specific analytic workload. The workload of interest is the use of synthetic data for logistic regression prediction models, which is a very frequent workload in health research. Methods: We evaluated 6 utility metrics on 30 different health data sets and 3 different SDG methods (a Bayesian network, a generative adversarial network, and sequential tree synthesis). These metrics were computed by averaging across 20 synthetic data sets from the same generative model. The metrics were then tested on their ability to rank the SDG methods based on prediction performance. Prediction performance was defined as the difference in the area under the receiver operating characteristic curve and the area under the precision-recall curve between logistic regression prediction models trained on synthetic data and those trained on real data. Results: The utility metric best able to rank SDG methods was the multivariate Hellinger distance based on a Gaussian copula representation of real and synthetic joint distributions. Conclusions: This study has validated a generative model utility metric, the multivariate Hellinger distance, which can be used to reliably rank competing SDG methods on the same data set. The Hellinger distance metric can be used to evaluate and compare alternate SDG methods. 
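The winning metric in the synthetic-data entry above is a multivariate Hellinger distance computed on a Gaussian copula representation; the Python sketch below shows only the univariate Hellinger building block on shared histogram bins, with simulated stand-in columns. Extending it to the copula-based multivariate form follows the paper, not this sketch.

import numpy as np

def hellinger(p, q):
    # Hellinger distance between two discrete distributions:
    # H = sqrt(1 - sum(sqrt(p_i * q_i))), bounded in [0, 1].
    p = np.asarray(p, dtype=float); p /= p.sum()
    q = np.asarray(q, dtype=float); q /= q.sum()
    return float(np.sqrt(max(0.0, 1.0 - np.sum(np.sqrt(p * q)))))

rng = np.random.default_rng(0)
real = rng.normal(50, 10, 1000)       # stand-in for a real data column
synthetic = rng.normal(52, 12, 1000)  # stand-in for its synthetic counterpart

# Shared bin edges so both histograms live on the same support.
bins = np.histogram_bin_edges(np.concatenate([real, synthetic]), bins=20)
p, _ = np.histogram(real, bins=bins)
q, _ = np.histogram(synthetic, bins=bins)
print(round(hellinger(p, q), 3))  # smaller = closer distributions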
", doi="10.2196/35734", url="https://medinform.jmir.org/2022/4/e35734", url="http://www.ncbi.nlm.nih.gov/pubmed/35389366" } @Article{info:doi/10.2196/35073, author="Haithcoat, Timothy and Liu, Danlu and Young, Tiffany and Shyu, Chi-Ren", title="Investigating Health Context Using a Spatial Data Analytical Tool: Development of a Geospatial Big Data Ecosystem", journal="JMIR Med Inform", year="2022", month="Apr", day="6", volume="10", number="4", pages="e35073", keywords="context", keywords="Geographic Information System", keywords="big data", keywords="equity", keywords="population health", keywords="public health", keywords="digital health", keywords="eHealth", keywords="location", keywords="geospatial", keywords="data analytics", keywords="analytical framework", keywords="medical informatics", keywords="research knowledgebase", abstract="Background: Enabling the use of spatial context is vital to understanding today's digital health problems. Any given location is associated with many different contexts. The strategic transformation of population health, epidemiology, and eHealth studies requires vast amounts of integrated digital data. Needed is a novel analytical framework designed to leverage location to create new contextual knowledge. The Geospatial Analytical Research Knowledgebase (GeoARK), a web-based research resource has robust, locationally integrated, social, environmental, and infrastructural information to address today's complex questions, investigate context, and spatially enable health investigations. GeoARK is different from other Geographic Information System (GIS) resources in that it has taken the layered world of the GIS and flattened it into a big data table that ties all the data and information together using location and developing its context. Objective: It is paramount to build a robust spatial data analytics framework that integrates social, environmental, and infrastructural knowledge to empower health researchers' use of geospatial context to timely answer population health issues. The goal is twofold in that it embodies an innovative technological approach and serves to ease the educational burden for health researchers to think spatially about their problems. Methods: A unique analytical tool using location as the key was developed. It allows integration across source, geography, and time to create a geospatial big table with over 162 million individual locations (X-Y points that serve as rows) and 5549 attributes (represented as columns). The concept of context (adjacency, proximity, distance, etc) is quantified through geoanalytics and captured as new distance, density, or neighbor attributes within the system. Development of geospatial analytics permits contextual extraction and investigator-initiated eHealth and mobile health (mHealth) analysis across multiple attributes. Results: We built a unique geospatial big data ecosystem called GeoARK. Analytics on this big table occur across resolution groups, sources, and geographies for extraction and analysis of information to gain new insights. Case studies, including telehealth assessment in North Carolina, national income inequality and health outcome disparity, and a Missouri COVID-19 risk assessment, demonstrate the capability to support robust and efficient geospatial understanding of a wide spectrum of population health questions. 
Conclusions: This research identified, compiled, transformed, standardized, and integrated multifaceted data required to better understand the context of health events within a large location-enabled database. The GeoARK system empowers health professionals to engage in more complex research in which the synergisms of health and geospatial information can be studied robustly, beyond what could be accomplished today. Knowing how to perform geospatial processing is no longer an impediment for health researchers; the greater challenge is now learning to think spatially. ", doi="10.2196/35073", url="https://medinform.jmir.org/2022/4/e35073", url="http://www.ncbi.nlm.nih.gov/pubmed/35311683" } @Article{info:doi/10.2196/35253, author="Wang, Alex and McCarron, Robert and Azzam, Daniel and Stehli, Annamarie and Xiong, Glen and DeMartini, Jeremy", title="Utilizing Big Data From Google Trends to Map Population Depression in the United States: Exploratory Infodemiology Study", journal="JMIR Ment Health", year="2022", month="Mar", day="31", volume="9", number="3", pages="e35253", keywords="depression", keywords="epidemiology", keywords="internet", keywords="google trends", keywords="big data", keywords="mental health", abstract="Background: The epidemiology of mental health disorders has important theoretical and practical implications for health care service and planning. The recent increase in big data storage and subsequent development of analytical tools suggest that mining search databases may yield important trends in mental health, which can be used to support existing population health studies. Objective: This study aimed to map depression search intent in the United States based on internet-based mental health queries. Methods: Weekly data on mental health searches were extracted from Google Trends for an 11-year period (2010-2021) and separated by US state for the following terms: ``feeling sad,'' ``depressed,'' ``depression,'' ``empty,'' ``insomnia,'' ``fatigue,'' ``guilty,'' ``feeling guilty,'' and ``suicide.'' Multivariable regression models were created based on geographic and environmental factors and normalized to the following control terms: ``sports,'' ``news,'' ``google,'' ``youtube,'' ``facebook,'' and ``netflix.'' Heat maps of population depression were generated based on search intent. Results: Depression search intent grew 67\% from January 2010 to March 2021. Depression search intent showed significant seasonal patterns with peak intensity during winter (adjusted P<.001) and early spring months (adjusted P<.001), relative to summer months. Geographic location correlated with depression search intent with states in the Northeast (adjusted P=.01) having higher search intent than states in the South. Conclusions: The trends extrapolated from Google Trends successfully correlate with known risk factors for depression, such as seasonality and increasing latitude. These findings suggest that Google Trends may be a valid novel epidemiological tool to map depression prevalence in the United States.
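A minimal sketch of the kind of seasonality check reported above follows, using a synthetic weekly search-intent series; a real analysis would substitute exported Google Trends data and the normalization and regression models the authors describe.

```python
# Sketch: testing for winter peaks in a weekly search-intent series, analogous to the
# seasonality analysis above (synthetic data standing in for Google Trends exports).
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
weeks = pd.date_range("2010-01-01", "2021-03-01", freq="W")
t = np.arange(len(weeks))
# Synthetic intent: upward trend plus a winter-peaking seasonal component plus noise.
intent = 50 + 0.05 * t + 10 * np.cos(2 * np.pi * weeks.dayofyear / 365.25) \
         + rng.normal(0, 3, len(weeks))
series = pd.Series(intent, index=weeks)

season = series.index.month.map(lambda m: "winter" if m in (12, 1, 2)
                                else "summer" if m in (6, 7, 8) else "other")
print(series.groupby(season).mean())  # compare winter vs summer mean intent
```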
", doi="10.2196/35253", url="https://mental.jmir.org/2022/3/e35253", url="http://www.ncbi.nlm.nih.gov/pubmed/35357320" } @Article{info:doi/10.2196/31021, author="Almowil, Zahra and Zhou, Shang-Ming and Brophy, Sinead and Croxall, Jodie", title="Concept Libraries for Repeatable and Reusable Research: Qualitative Study Exploring the Needs of Users", journal="JMIR Hum Factors", year="2022", month="Mar", day="15", volume="9", number="1", pages="e31021", keywords="electronic health records", keywords="record linkage", keywords="reproducible research", keywords="clinical codes", keywords="concept libraries", abstract="Background: Big data research in the field of health sciences is hindered by a lack of agreement on how to identify and define different conditions and their medications. This means that researchers and health professionals often have different phenotype definitions for the same condition. This lack of agreement makes it difficult to compare different study findings and hinders the ability to conduct repeatable and reusable research. Objective: This study aims to examine the requirements of various users, such as researchers, clinicians, machine learning experts, and managers, in the development of a data portal for phenotypes (a concept library). Methods: This was a qualitative study using interviews and focus group discussion. One-to-one interviews were conducted with researchers, clinicians, machine learning experts, and senior research managers in health data science (N=6) to explore their specific needs in the development of a concept library. In addition, a focus group discussion with researchers (N=14) working with the Secured Anonymized Information Linkage databank, a national eHealth data linkage infrastructure, was held to perform a SWOT (strengths, weaknesses, opportunities, and threats) analysis for the phenotyping system and the proposed concept library. The interviews and focus group discussion were transcribed verbatim, and 2 thematic analyses were performed. Results: Most of the participants thought that the prototype concept library would be a very helpful resource for conducting repeatable research, but they specified that many requirements are needed before its development. Although all the participants stated that they were aware of some existing concept libraries, most of them expressed negative perceptions about them. The participants mentioned several facilitators that would stimulate them to share their work and reuse the work of others, and they pointed out several barriers that could inhibit them from sharing their work and reusing the work of others. The participants suggested some developments that they would like to see to improve reproducible research output using routine data. Conclusions: The study indicated that most interviewees valued a concept library for phenotypes. However, only half of the participants felt that they would contribute by providing definitions for the concept library, and they reported many barriers regarding sharing their work on a publicly accessible platform. Analysis of interviews and the focus group discussion revealed that different stakeholders have different requirements, facilitators, barriers, and concerns about a prototype concept library. 
", doi="10.2196/31021", url="https://humanfactors.jmir.org/2022/1/e31021", url="http://www.ncbi.nlm.nih.gov/pubmed/35289755" } @Article{info:doi/10.2196/31684, author="Gao, Chuang and McGilchrist, Mark and Mumtaz, Shahzad and Hall, Christopher and Anderson, Ann Lesley and Zurowski, John and Gordon, Sharon and Lumsden, Joanne and Munro, Vicky and Wozniak, Artur and Sibley, Michael and Banks, Christopher and Duncan, Chris and Linksted, Pamela and Hume, Alastair and Stables, L. Catherine and Mayor, Charlie and Caldwell, Jacqueline and Wilde, Katie and Cole, Christian and Jefferson, Emily", title="A National Network of Safe Havens: Scottish Perspective", journal="J Med Internet Res", year="2022", month="Mar", day="9", volume="24", number="3", pages="e31684", keywords="electronic health records", keywords="Safe Haven", keywords="data governance", doi="10.2196/31684", url="https://www.jmir.org/2022/3/e31684", url="http://www.ncbi.nlm.nih.gov/pubmed/35262495" } @Article{info:doi/10.2196/30328, author="Evans, Richard and Burns, Jennifer and Damschroder, Laura and Annis, Ann and Freitag, B. Michelle and Raffa, Susan and Wiitala, Wyndy", title="Deriving Weight From Big Data: Comparison of Body Weight Measurement--Cleaning Algorithms", journal="JMIR Med Inform", year="2022", month="Mar", day="9", volume="10", number="3", pages="e30328", keywords="veterans", keywords="weight", keywords="algorithms", keywords="obesity", keywords="measurement", keywords="electronic health record", abstract="Background: Patient body weight is a frequently used measure in biomedical studies, yet there are no standard methods for processing and cleaning weight data. Conflicting documentation on constructing body weight measurements presents challenges for research and program evaluation. Objective: In this study, we aim to describe and compare methods for extracting and cleaning weight data from electronic health record databases to develop guidelines for standardized approaches that promote reproducibility. Methods: We conducted a systematic review of studies published from 2008 to 2018 that used Veterans Health Administration electronic health record weight data and documented the algorithms for constructing patient weight. We applied these algorithms to a cohort of veterans with at least one primary care visit in 2016. The resulting weight measures were compared at the patient and site levels. Results: We identified 496 studies and included 62 (12.5\%) that used weight as an outcome. Approximately 48\% (27/62) included a replicable algorithm. Algorithms varied from cutoffs of implausible weights to complex models using measures within patients over time. We found differences in the number of weight values after applying the algorithms (71,961/1,175,995, 6.12\% to 1,175,177/1,175,995, 99.93\% of raw data) but little difference in average weights across methods (93.3, SD 21.0 kg to 94.8, SD 21.8 kg). The percentage of patients with at least 5\% weight loss over 1 year ranged from 9.37\% (4933/52,642) to 13.99\% (3355/23,987). Conclusions: Contrasting algorithms provide similar results and, in some cases, the results are not different from using raw, unprocessed data despite algorithm complexity. Studies using point estimates of weight may benefit from a simple cleaning rule based on cutoffs of implausible values; however, research questions involving weight trajectories and other, more complex scenarios may benefit from a more nuanced algorithm that considers all available weight data. 
", doi="10.2196/30328", url="https://medinform.jmir.org/2022/3/e30328", url="http://www.ncbi.nlm.nih.gov/pubmed/35262492" } @Article{info:doi/10.2196/27146, author="Wang, Liya and Qiu, Hang and Luo, Li and Zhou, Li", title="Age- and Sex-Specific Differences in Multimorbidity Patterns and Temporal Trends on Assessing Hospital Discharge Records in Southwest China: Network-Based Study", journal="J Med Internet Res", year="2022", month="Feb", day="25", volume="24", number="2", pages="e27146", keywords="multimorbidity pattern", keywords="temporal trend", keywords="network analysis", keywords="multimorbidity prevalence", keywords="administrative data", keywords="longitudinal study", keywords="regional research", abstract="Background: Multimorbidity represents a global health challenge, which requires a more global understanding of multimorbidity patterns and trends. However, the majority of studies completed to date have often relied on self-reported conditions, and a simultaneous assessment of the entire spectrum of chronic disease co-occurrence, especially in developing regions, has not yet been performed. Objective: We attempted to provide a multidimensional approach to understand the full spectrum of chronic disease co-occurrence among general inpatients in southwest China, in order to investigate multimorbidity patterns and temporal trends, and assess their age and sex differences. Methods: We conducted a retrospective cohort analysis based on 8.8 million hospital discharge records of about 5.0 million individuals of all ages from 2015 to 2019 in a megacity in southwest China. We examined all chronic diagnoses using the ICD-10 (International Classification of Diseases, 10th revision) codes at 3 digits and focused on chronic diseases with ?1\% prevalence for each of the age and sex strata, which resulted in a total of 149 and 145 chronic diseases in males and females, respectively. We constructed multimorbidity networks in the general population based on sex and age, and used the cosine index to measure the co-occurrence of chronic diseases. Then, we divided the networks into communities and assessed their temporal trends. Results: The results showed complex interactions among chronic diseases, with more intensive connections among males and inpatients ?40 years old. A total of 9 chronic diseases were simultaneously classified as central diseases, hubs, and bursts in the multimorbidity networks. Among them, 5 diseases were common to both males and females, including hypertension, chronic ischemic heart disease, cerebral infarction, other cerebrovascular diseases, and atherosclerosis. The earliest leaps (degree leaps ?6) appeared at a disorder of glycoprotein metabolism that happened at 25-29 years in males, about 15 years earlier than in females. The number of chronic diseases in the community increased over time, but the new entrants did not replace the root of the community. Conclusions: Our multimorbidity network analysis identified specific differences in the co-occurrence of chronic diagnoses by sex and age, which could help in the design of clinical interventions for inpatient multimorbidity. ", doi="10.2196/27146", url="https://www.jmir.org/2022/2/e27146", url="http://www.ncbi.nlm.nih.gov/pubmed/35212632" } @Article{info:doi/10.2196/27534, author="Chishtie, Jawad and Bielska, Anna Iwona and Barrera, Aldo and Marchand, Jean-Sebastien and Imran, Muhammad and Tirmizi, Ali Syed Farhan and Turcotte, A. 
Luke and Munce, Sarah and Shepherd, John and Senthinathan, Arrani and Cepoiu-Martin, Monica and Irvine, Michael and Babineau, Jessica and Abudiab, Sally and Bjelica, Marko and Collins, Christopher and Craven, Catharine B. and Guilcher, Sara and Jeji, Tara and Naraei, Parisa and Jaglal, Susan", title="Interactive Visualization Applications in Population Health and Health Services Research: Systematic Scoping Review", journal="J Med Internet Res", year="2022", month="Feb", day="18", volume="24", number="2", pages="e27534", keywords="interactive visualization", keywords="data visualization", keywords="secondary health care data", keywords="public health informatics", keywords="population health", keywords="health services research", abstract="Background: Simple visualizations in health research data, such as scatter plots, heat maps, and bar charts, typically present relationships between 2 variables. Interactive visualization methods allow for multiple related facets such as numerous risk factors to be studied simultaneously, leading to data insights through exploring trends and patterns from complex big health care data. The technique presents a powerful tool that can be used in combination with statistical analysis for knowledge discovery, hypothesis generation and testing, and decision support. Objective: The primary objective of this scoping review is to describe and summarize the evidence of interactive visualization applications, methods, and tools being used in population health and health services research (HSR) and their subdomains in the last 15 years, from January 1, 2005, to March 30, 2019. Our secondary objective is to describe the use cases, metrics, frameworks used, settings, target audience, goals, and co-design of applications. Methods: We adapted standard scoping review guidelines with a peer-reviewed search strategy: 2 independent researchers at each stage of screening and abstraction, with a third independent researcher to arbitrate conflicts and validate findings. A comprehensive abstraction platform was built to capture the data from diverse bodies of literature, primarily from the computer science and health care sectors. After screening 11,310 articles, we present findings from 56 applications from interrelated areas of population health and HSR, as well as their subdomains such as epidemiologic surveillance, health resource planning, access, and use and costs among diverse clinical and demographic populations. Results: In this companion review to our earlier systematic synthesis of the literature on visual analytics applications, we present findings in 6 major themes of interactive visualization applications developed for 8 major problem categories. We found a wide application of interactive visualization methods, the major ones being epidemiologic surveillance for infectious disease, resource planning, health service monitoring and quality, and studying medication use patterns. The data sources included mostly secondary administrative and electronic medical record data. In addition, at least two-thirds of the applications involved participatory co-design approaches while introducing a distinct category, embedded research, within co-design initiatives. These applications were in response to an identified need for data-driven insights into knowledge generation and decision support. 
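Looking back at the multimorbidity network study above (Wang et al), its cosine co-occurrence index is straightforward to compute from a binary patient-by-disease matrix; the toy matrix below is an assumption for illustration, not the study's data.

```python
# Sketch of the cosine co-occurrence index used in the multimorbidity network study
# above (toy data; rows are patients, columns are chronic disease indicators).
import numpy as np

# Binary diagnosis matrix: 5 patients x 3 diseases.
D = np.array([
    [1, 1, 0],
    [1, 1, 1],
    [0, 1, 0],
    [1, 0, 0],
    [1, 1, 0],
])

def cosine_index(i, j, X):
    """C_ij = n_ij / sqrt(n_i * n_j): co-occurrences over the geometric mean of prevalences."""
    n_ij = np.sum(X[:, i] * X[:, j])
    return n_ij / np.sqrt(X[:, i].sum() * X[:, j].sum())

print(cosine_index(0, 1, D))  # edge weight between disease 0 and disease 1
```

Edges whose index exceeds a chosen threshold would then form the multimorbidity network on which community detection runs.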
We further discuss the opportunities stemming from the use of interactive visualization methods in studying global health; inequities, including social determinants of health; and other related areas. We also allude to the challenges in the uptake of these methods. Conclusions: Visualization in health has strong historical roots, with an upward trend in the use of these methods in population health and HSR. Such applications are being rapidly adopted by academic and health care agencies for knowledge discovery, hypothesis generation, and decision support. International Registered Report Identifier (IRRID): RR2-10.2196/14019 ", doi="10.2196/27534", url="https://www.jmir.org/2022/2/e27534", url="http://www.ncbi.nlm.nih.gov/pubmed/35179499" } @Article{info:doi/10.2196/34560, author="Bove, Riley and Schleimer, Erica and Sukhanov, Paul and Gilson, Michael and Law, M. Sindy and Barnecut, Andrew and Miller, L. Bruce and Hauser, L. Stephen and Sanders, J. Stephan and Rankin, P. Katherine", title="Building a Precision Medicine Delivery Platform for Clinics: The University of California, San Francisco, BRIDGE Experience", journal="J Med Internet Res", year="2022", month="Feb", day="15", volume="24", number="2", pages="e34560", keywords="precision medicine", keywords="clinical implementation", keywords="in silico trials", keywords="clinical dashboard", keywords="precision", keywords="implementation", keywords="dashboard", keywords="design", keywords="experience", keywords="analytic", keywords="tool", keywords="analysis", keywords="decision-making", keywords="real time", keywords="platform", keywords="human-centered design", doi="10.2196/34560", url="https://www.jmir.org/2022/2/e34560", url="http://www.ncbi.nlm.nih.gov/pubmed/35166689" } @Article{info:doi/10.2196/28036, author="Yu, Jia-Ruei and Chen, Chun-Hsien and Huang, Tsung-Wei and Lu, Jang-Jih and Chung, Chia-Ru and Lin, Ting-Wei and Wu, Min-Hsien and Tseng, Yi-Ju and Wang, Hsin-Yao", title="Energy Efficiency of Inference Algorithms for Clinical Laboratory Data Sets: Green Artificial Intelligence Study", journal="J Med Internet Res", year="2022", month="Jan", day="25", volume="24", number="1", pages="e28036", keywords="medical informatics", keywords="machine learning", keywords="algorithms", keywords="energy consumption", keywords="artificial intelligence", keywords="energy efficient", keywords="medical domain", keywords="medical data sets", keywords="informatics", abstract="Background: The use of artificial intelligence (AI) in the medical domain has attracted considerable research interest. Inference applications in the medical domain require energy-efficient AI models. In contrast to other types of data in visual AI, data from medical laboratories usually comprise features with strong signals. Numerous energy optimization techniques have been developed to relieve the burden on the hardware required to deploy a complex learning model. However, the energy efficiency levels of different AI models used for medical applications have not been studied. Objective: The aim of this study was to explore and compare the energy efficiency levels of commonly used machine learning algorithms---logistic regression (LR), k-nearest neighbor, support vector machine, random forest (RF), and extreme gradient boosting (XGB) algorithms, as well as 4 different variants of neural network (NN) algorithms---when applied to clinical laboratory data sets.
Methods: We applied the aforementioned algorithms to two distinct clinical laboratory data sets: a mass spectrometry data set regarding Staphylococcus aureus for predicting methicillin resistance (3338 cases; 268 features) and a urinalysis data set for predicting Trichomonas vaginalis infection (839,164 cases; 9 features). We compared the performance of the nine inference algorithms in terms of accuracy, area under the receiver operating characteristic curve (AUROC), time consumption, and power consumption. The time and power consumption levels were determined using performance counter data from Intel Power Gadget 3.5. Results: The experimental results indicated that the RF and XGB algorithms achieved the two highest AUROC values for both data sets (84.7\% and 83.9\%, respectively, for the mass spectrometry data set; 91.1\% and 91.4\%, respectively, for the urinalysis data set). The XGB and LR algorithms exhibited the shortest inference time for both data sets (0.47 milliseconds for both in the mass spectrometry data set; 0.39 and 0.47 milliseconds, respectively, for the urinalysis data set). Compared with the RF algorithm, the XGB and LR algorithms exhibited a 45\% and 53\%-60\% reduction in inference time for the mass spectrometry and urinalysis data sets, respectively. In terms of energy efficiency, the XGB algorithm exhibited the lowest power consumption for the mass spectrometry data set (9.42 Watts) and the LR algorithm exhibited the lowest power consumption for the urinalysis data set (9.98 Watts). Compared with a five-hidden-layer NN, the XGB and LR algorithms achieved 16\%-24\% and 9\%-13\% lower power consumption levels for the mass spectrometry and urinalysis data sets, respectively. In all experiments, the XGB algorithm exhibited the best performance in terms of accuracy, run time, and energy efficiency. Conclusions: The XGB algorithm achieved balanced performance levels in terms of AUROC, run time, and energy efficiency for the two clinical laboratory data sets. Considering the energy constraints in real-world scenarios, the XGB algorithm is ideal for medical AI applications. ", doi="10.2196/28036", url="https://www.jmir.org/2022/1/e28036", url="http://www.ncbi.nlm.nih.gov/pubmed/35076405" } @Article{info:doi/10.2196/28842, author="Kumar, Sajit and Nanelia, Alicia and Mariappan, Ragunathan and Rajagopal, Adithya and Rajan, Vaibhav", title="Patient Representation Learning From Heterogeneous Data Sources and Knowledge Graphs Using Deep Collective Matrix Factorization: Evaluation Study", journal="JMIR Med Inform", year="2022", month="Jan", day="20", volume="10", number="1", pages="e28842", keywords="representation learning", keywords="deep collective matrix factorization", keywords="electronic medical records", keywords="knowledge graphs", keywords="multiview learning", keywords="graph embeddings", keywords="clinical decision support", abstract="Background: Patient representation learning aims to learn features, also called representations, from input sources automatically, often in an unsupervised manner, for use in predictive models. This obviates the need for cumbersome, time- and resource-intensive manual feature engineering, especially from unstructured data such as text, images, or graphs. Most previous techniques have used neural network--based autoencoders to learn patient representations, primarily from clinical notes in electronic medical records (EMRs). 
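Returning to the green AI study above (Yu et al), the inference-time part of such a benchmark can be sketched as follows; the data are synthetic, scikit-learn's gradient boosting stands in for XGBoost, and power draw would require an external meter such as Intel Power Gadget.

```python
# Sketch of an inference-time comparison in the spirit of the benchmark above
# (synthetic data; not the study's data sets or exact model configurations).
import time
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(5000, 9))            # urinalysis-like: many cases, few features
y = (X[:, 0] + rng.normal(size=5000) > 0).astype(int)

for model in (LogisticRegression(max_iter=1000),
              RandomForestClassifier(n_estimators=100),
              GradientBoostingClassifier()):
    model.fit(X, y)
    start = time.perf_counter()
    model.predict(X)
    elapsed_ms = (time.perf_counter() - start) / len(X) * 1e3
    print(f"{type(model).__name__}: {elapsed_ms:.4f} ms per case")
```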
Knowledge graphs (KGs), with clinical entities as nodes and their relations as edges, can be extracted automatically from biomedical literature; they provide information complementary to EMR data and have been found to supply valuable predictive signals. Objective: This study aims to evaluate the efficacy of collective matrix factorization (CMF), both the classical variant and a recent neural architecture called deep CMF (DCMF), in integrating heterogeneous data sources from EMR and KG to obtain patient representations for clinical decision support tasks. Methods: Using a recent formulation for obtaining graph representations through matrix factorization within the context of CMF, we infused auxiliary information during patient representation learning. We also extended the DCMF architecture to create a task-specific end-to-end model that learns to simultaneously find effective patient representations and predictions. We compared the efficacy of such a model to that of first learning unsupervised representations and then independently learning a predictive model. We evaluated patient representation learning using CMF-based methods and autoencoders for 2 clinical decision support tasks on a large EMR data set. Results: Our experiments show that DCMF provides a seamless way for integrating multiple sources of data to obtain patient representations, both in unsupervised and supervised settings. Its performance in single-source settings is comparable with that of previous autoencoder-based representation learning methods. When DCMF is used to obtain representations from a combination of EMR and KG, where most previous autoencoder-based methods cannot be used directly, its performance is superior to that of previous nonneural methods for CMF. Infusing information from KGs into patient representations using DCMF was found to improve downstream predictive performance. Conclusions: Our experiments indicate that DCMF is a versatile model that can be used to obtain representations from single and multiple data sources and combine information from EMR data and KGs. Furthermore, DCMF can be used to learn representations in both supervised and unsupervised settings. Thus, DCMF offers an effective way of integrating heterogeneous data sources and infusing auxiliary knowledge into patient representations. ", doi="10.2196/28842", url="https://medinform.jmir.org/2022/1/e28842", url="http://www.ncbi.nlm.nih.gov/pubmed/35049514" } @Article{info:doi/10.2196/30557, author="Vaidyam, Aditya and Halamka, John and Torous, John", title="Enabling Research and Clinical Use of Patient-Generated Health Data (the mindLAMP Platform): Digital Phenotyping Study", journal="JMIR Mhealth Uhealth", year="2022", month="Jan", day="7", volume="10", number="1", pages="e30557", keywords="digital phenotyping", keywords="mHealth", keywords="apps", keywords="FHIR", keywords="digital health", keywords="health data", keywords="patient-generated health data", keywords="mobile health", keywords="smartphones", keywords="wearables", keywords="mobile apps", keywords="mental health, mobile phone", abstract="Background: There is a growing need for the integration of patient-generated health data (PGHD) into research and clinical care to enable personalized, preventive, and interactive care, but technical and organizational challenges, such as the lack of standards and easy-to-use tools, preclude the effective use of PGHD generated from consumer devices, such as smartphones and wearables.
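As a companion to the Kumar et al entry above, a bare-bones version of collective matrix factorization, in which two matrices share one patient factor, can be written with plain gradient descent; this toy sketch is not the DCMF architecture.

```python
# Minimal sketch of collective matrix factorization: X (patients x EMR features) and
# Y (patients x KG features) share one patient factor U (toy example, not DCMF).
import numpy as np

rng = np.random.default_rng(0)
n, d1, d2, k = 50, 20, 15, 5
X = rng.normal(size=(n, d1))   # EMR-derived matrix
Y = rng.normal(size=(n, d2))   # KG-derived matrix

U = rng.normal(scale=0.1, size=(n, k))    # shared patient representations
V = rng.normal(scale=0.1, size=(d1, k))   # EMR feature factors
W = rng.normal(scale=0.1, size=(d2, k))   # KG feature factors

lr = 0.01
for _ in range(500):
    Ex, Ey = U @ V.T - X, U @ W.T - Y      # reconstruction errors for both matrices
    U -= lr * (Ex @ V + Ey @ W)            # gradient of ||Ex||^2 + ||Ey||^2 wrt U
    V -= lr * Ex.T @ U
    W -= lr * Ey.T @ U

# Both reconstruction losses shrink because U is learned jointly from both sources.
print(np.linalg.norm(U @ V.T - X), np.linalg.norm(U @ W.T - Y))
```

The rows of U would then serve as patient representations for a downstream predictive model.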
Objective: This study outlines how we used mobile apps and semantic web standards such as HTTP 2.0, Representational State Transfer, JSON (JavaScript Object Notation), JSON Schema, Transport Layer Security (version 1.3), Advanced Encryption Standard-256, OpenAPI, HTML5, and Vega, in conjunction with patient and provider feedback to completely update a previous version of mindLAMP. Methods: The Learn, Assess, Manage, and Prevent (LAMP) platform addresses the abovementioned challenges in enhancing clinical insight by supporting research, data analysis, and implementation efforts around PGHD as an open-source solution with freely accessible and shared code. Results: With a simplified programming interface and novel data representation that captures additional metadata, the LAMP platform enables interoperability with existing Fast Healthcare Interoperability Resources--based health care systems as well as consumer wearables and services such as Apple HealthKit and Google Fit. The companion Cortex data analysis and machine learning toolkit offer robust support for artificial intelligence, behavioral feature extraction, interactive visualizations, and high-performance data processing through parallelization and vectorization techniques. Conclusions: The LAMP platform incorporates feedback from patients and clinicians alongside a standards-based approach to address these needs and functions across a wide range of use cases through its customizable and flexible components. These range from simple survey-based research to international consortiums capturing multimodal data to simple delivery of mindfulness exercises through personalized, just-in-time adaptive interventions. ", doi="10.2196/30557", url="https://mhealth.jmir.org/2022/1/e30557", url="http://www.ncbi.nlm.nih.gov/pubmed/34994710" } @Article{info:doi/10.2196/30720, author="Wang, Ni and Wang, Muyu and Zhou, Yang and Liu, Honglei and Wei, Lan and Fei, Xiaolu and Chen, Hui", title="Sequential Data--Based Patient Similarity Framework for Patient Outcome Prediction: Algorithm Development", journal="J Med Internet Res", year="2022", month="Jan", day="6", volume="24", number="1", pages="e30720", keywords="patient similarity", keywords="electronic medical records", keywords="time series", keywords="acute myocardial infarction", keywords="natural language processing", keywords="machine learning", keywords="deep learning", keywords="outcome prediction", keywords="informatics", keywords="health data", abstract="Background: Sequential information in electronic medical records is valuable and helpful for patient outcome prediction but is rarely used for patient similarity measurement because of its unevenness, irregularity, and heterogeneity. Objective: We aimed to develop a patient similarity framework for patient outcome prediction that makes use of sequential and cross-sectional information in electronic medical record systems. Methods: Sequence similarity was calculated from timestamped event sequences using edit distance, and trend similarity was calculated from time series using dynamic time warping and Haar decomposition. We also extracted cross-sectional information, namely, demographic, laboratory test, and radiological report data, for additional similarity calculations. 
We validated the effectiveness of the framework by constructing k--nearest neighbors classifiers to predict mortality and readmission for acute myocardial infarction patients, using data from (1) a public data set and (2) a private data set, at 3 time points---at admission, on Day 7, and at discharge---to provide early warning of patient outcomes. We also constructed state-of-the-art Euclidean-distance k--nearest neighbor, logistic regression, random forest, long short-term memory network, and recurrent neural network models, which were used for comparison. Results: With all available information during a hospitalization episode, predictive models using the similarity model outperformed baseline models based on both public and private data sets. For mortality predictions, all models except for the logistic regression model showed improved performances over time. There were no such increasing trends in predictive performances for readmission predictions. The random forest and logistic regression models performed best for mortality and readmission predictions, respectively, when using information from the first week after admission. Conclusions: For patient outcome predictions, the patient similarity framework facilitated sequential similarity calculations for uneven electronic medical record data and helped improve predictive performance. ", doi="10.2196/30720", url="https://www.jmir.org/2022/1/e30720", url="http://www.ncbi.nlm.nih.gov/pubmed/34989682" } @Article{info:doi/10.2196/34567, author="Santonen, Teemu and Petsani, Despoina and Julin, Mikko and Garschall, Markus and Kropf, Johannes and Van der Auwera, Vicky and Bernaerts, Sylvie and Losada, Raquel and Almeida, Rosa and Garatea, Jokin and Mu{\~n}oz, Idoia and Nagy, Eniko and Kehayia, Eva and de Guise, Elaine and Nadeau, Sylvie and Azevedo, Nancy and Segkouli, Sofia and Lazarou, Ioulietta and Petronikolou, Vasileia and Bamidis, Panagiotis and Konstantinidis, Evdokimos", title="Cocreating a Harmonized Living Lab for Big Data--Driven Hybrid Persona Development: Protocol for Cocreating, Testing, and Seeking Consensus", journal="JMIR Res Protoc", year="2022", month="Jan", day="6", volume="11", number="1", pages="e34567", keywords="Living Lab", keywords="everyday living", keywords="technology", keywords="big data", keywords="harmonization", keywords="personas", keywords="small-scale real-life testing", keywords="mobile phone", abstract="Background: Living Labs are user-centered, open innovation ecosystems based on a systematic user cocreation approach, which integrates research and innovation processes in real-life communities and settings. The Horizon 2020 Project VITALISE (Virtual Health and Wellbeing Living Lab Infrastructure) unites 19 partners across 11 countries. The project aims to harmonize Living Lab procedures and enable effective and convenient transnational and virtual access to key European health and well-being research infrastructures, which are governed by Living Labs. The VITALISE consortium will conduct joint research activities in the fields included in the care pathway of patients: rehabilitation, transitional care, and everyday living environments for older adults. This protocol focuses on health and well-being research in everyday living environments. Objective: The main aim of this study is to cocreate and test a harmonized research protocol for developing big data--driven hybrid personas, which are hypothetical user archetypes created to represent a user community.
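Referring back to the patient similarity framework above (Wang et al), its sequence-similarity component rests on the classic edit distance; a self-contained sketch over toy event sequences follows.

```python
# Sketch of the sequence-similarity component above: Levenshtein edit distance between
# two event sequences, represented here as lists of event codes (toy data).
def edit_distance(a, b):
    """Classic dynamic-programming Levenshtein distance between two sequences."""
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i in range(len(a) + 1):
        dp[i][0] = i
    for j in range(len(b) + 1):
        dp[0][j] = j
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                           dp[i][j - 1] + 1,         # insertion
                           dp[i - 1][j - 1] + cost)  # substitution
    return dp[-1][-1]

seq_a = ["admission", "ecg", "troponin", "pci", "discharge"]
seq_b = ["admission", "troponin", "ecg", "cabg", "discharge"]
print(edit_distance(seq_a, seq_b))  # smaller distance = more similar trajectories
```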
In addition, the use and applicability of innovative technologies will be investigated in the context of various everyday living and Living Lab environments. Methods: In phase 1, surveys and structured interviews will be used to identify the most suitable Living Lab methods, tools, and instruments for health-related research among VITALISE project Living Labs (N=10). A series of web-based cocreation workshops and iterative cowriting processes will be applied to define the initial protocols. In phase 2, five small-scale case studies will be conducted to test the cocreated research protocols in various real-life everyday living settings and Living Lab infrastructures. In phase 3, a cross-case analysis grounded in semistructured interviews will be conducted to identify the challenges and benefits of using the proposed research protocols. Furthermore, a series of cocreation workshops and a consensus-seeking Delphi study process will be conducted in parallel to cocreate and validate the acceptance of the defined harmonized research protocols among wider Living Lab communities. Results: As of September 30, 2021, the project deliverables ``Ethics and safety manual'' and ``Living lab standard version 1'' have been submitted to the European Commission review process. The study will be finished by March 2024. Conclusions: The outcome of this research will lead to harmonized procedures and protocols in the context of big data--driven hybrid persona development among health and well-being Living Labs in Europe and beyond. Harmonized protocols enable Living Labs to exploit similar research protocols, devices, hardware, and software for interventions and complex data collection purposes. Economies of scale and improved use of resources will speed up and improve research quality and offer novel possibilities for open data sharing, multidisciplinary research, and comparative studies beyond current practices. Case studies will also provide novel insights for implementing innovative technologies in the context of everyday Living Lab research. International Registered Report Identifier (IRRID): DERR1-10.2196/34567 ", doi="10.2196/34567", url="https://www.researchprotocols.org/2022/1/e34567", url="http://www.ncbi.nlm.nih.gov/pubmed/34989697" } @Article{info:doi/10.2196/27008, author="Yao, Li-Hung and Leung, Ka-Chun and Tsai, Chu-Lin and Huang, Chien-Hua and Fu, Li-Chen", title="A Novel Deep Learning--Based System for Triage in the Emergency Department Using Electronic Medical Records: Retrospective Cohort Study", journal="J Med Internet Res", year="2021", month="Dec", day="27", volume="23", number="12", pages="e27008", keywords="emergency department", keywords="triage system", keywords="deep learning", keywords="hospital admission", keywords="data to text", keywords="electronic health record", abstract="Background: Emergency department (ED) crowding has resulted in delayed patient treatment and has become a universal health care problem. Although a triage system, such as the 5-level emergency severity index, somewhat improves the process of ED treatment, it still heavily relies on the nurse's subjective judgment and triages too many patients to emergency severity index level 3 in current practice. Hence, a system that can help clinicians accurately triage a patient's condition is imperative. Objective: This study aims to develop a deep learning--based triage system using patients' ED electronic medical records to predict clinical outcomes after ED treatments.
Methods: We conducted a retrospective study using data from an open data set from the National Hospital Ambulatory Medical Care Survey from 2012 to 2016 and data from a local data set from the National Taiwan University Hospital from 2009 to 2015. In this study, we transformed structured data into text form and used convolutional neural networks combined with recurrent neural networks and attention mechanisms to accomplish the classification task. We evaluated our performance using area under the receiver operating characteristic curve (AUROC). Results: A total of 118,602 patients from the National Hospital Ambulatory Medical Care Survey were included in this study for predicting hospitalization, and the accuracy and AUROC were 0.83 and 0.87, respectively. On the other hand, an external experiment used our own data set from the National Taiwan University Hospital, which included 745,441 patients, where the accuracy and AUROC were similar, that is, 0.83 and 0.88, respectively. Moreover, to effectively evaluate the prediction quality of our proposed system, we also applied the model to other clinical outcomes, including mortality and admission to the intensive care unit, and the results showed that our proposed method was approximately 3\% to 5\% higher in accuracy than other conventional methods. Conclusions: Our proposed method achieved better performance than the traditional method, and its implementation is relatively easy: it includes commonly used variables and is better suited to real-world clinical settings. In future work, we will validate our novel deep learning--based triage algorithm in prospective clinical trials, and we hope to use it to guide resource allocation in a busy ED once validation succeeds. ", doi="10.2196/27008", url="https://www.jmir.org/2021/12/e27008", url="http://www.ncbi.nlm.nih.gov/pubmed/34958305" } @Article{info:doi/10.2196/28632, author="Chopard, Daphne and Treder, S. Matthias and Corcoran, Padraig and Ahmed, Nagheen and Johnson, Claire and Busse, Monica and Spasic, Irena", title="Text Mining of Adverse Events in Clinical Trials: Deep Learning Approach", journal="JMIR Med Inform", year="2021", month="Dec", day="24", volume="9", number="12", pages="e28632", keywords="natural language processing", keywords="deep learning", keywords="machine learning", keywords="classification", abstract="Background: Pharmacovigilance and safety reporting, which involve processes for monitoring the use of medicines in clinical trials, play a critical role in the identification of previously unrecognized adverse events or changes in the patterns of adverse events. Objective: This study aims to demonstrate the feasibility of automating the coding of adverse events described in the narrative section of the serious adverse event report forms to enable statistical analysis of the aforementioned patterns. Methods: We used the Unified Medical Language System (UMLS) as the coding scheme, which integrates 217 source vocabularies, thus enabling coding against other relevant terminologies such as the International Classification of Diseases--10th Revision, Medical Dictionary for Regulatory Activities, and Systematized Nomenclature of Medicine. We used MetaMap, highly configurable dictionary lookup software, to identify the mentions of the UMLS concepts.
We trained a binary classifier using Bidirectional Encoder Representations from Transformers (BERT), a transformer-based language model that captures contextual relationships, to differentiate between mentions of the UMLS concepts that represented adverse events and those that did not. Results: The model achieved a high F1 score of 0.8080, despite the class imbalance. This is 10.15 percentage points lower than human-like performance but also 17.45 percentage points higher than that of the baseline approach. Conclusions: These results confirmed that automated coding of adverse events described in the narrative section of serious adverse event reports is feasible. Once coded, adverse events can be statistically analyzed so that any correlations with the trialed medicines can be estimated in a timely fashion. ", doi="10.2196/28632", url="https://medinform.jmir.org/2021/12/e28632", url="http://www.ncbi.nlm.nih.gov/pubmed/34951601" } @Article{info:doi/10.2196/31618, author="Cho, Sylvia and Weng, Chunhua and Kahn, G. Michael and Natarajan, Karthik", title="Identifying Data Quality Dimensions for Person-Generated Wearable Device Data: Multi-Method Study", journal="JMIR Mhealth Uhealth", year="2021", month="Dec", day="23", volume="9", number="12", pages="e31618", keywords="patient-generated health data", keywords="data accuracy", keywords="data quality", keywords="wearable device", keywords="fitness trackers", keywords="qualitative research", abstract="Background: There is a growing interest in using person-generated wearable device data for biomedical research, but there are also concerns regarding the quality of data such as missing or incorrect data. This emphasizes the importance of assessing data quality before conducting research. In order to perform data quality assessments, it is essential to define what data quality means for person-generated wearable device data by identifying the data quality dimensions. Objective: This study aims to identify data quality dimensions for person-generated wearable device data for research purposes. Methods: This study was conducted in 3 phases: literature review, survey, and focus group discussion. The literature review was conducted following the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) guideline to identify factors affecting data quality and its associated data quality challenges. In addition, we conducted a survey to confirm and complement results from the literature review and to understand researchers' perceptions of data quality dimensions that were previously identified as dimensions for the secondary use of electronic health record (EHR) data. We sent the survey to researchers with experience in analyzing wearable device data. Focus group discussion sessions were conducted with domain experts to derive data quality dimensions for person-generated wearable device data. On the basis of the results from the literature review and survey, a facilitator proposed potential data quality dimensions relevant to person-generated wearable device data, and the domain experts accepted or rejected the suggested dimensions. Results: In total, 19 studies were included in the literature review, and 3 major themes emerged: device- and technical-related, user-related, and data governance--related factors. The associated data quality problems were incomplete data, incorrect data, and heterogeneous data. A total of 20 respondents answered the survey.
The major data quality challenges faced by researchers were completeness, accuracy, and plausibility. The importance ratings on data quality dimensions in an existing framework showed that the dimensions for secondary use of EHR data are applicable to person-generated wearable device data. There were 3 focus group sessions with domain experts in data quality and wearable device research. The experts concluded that intrinsic data quality features, such as conformance, completeness, and plausibility, and contextual and fitness-for-use data quality features, such as completeness (breadth and density) and temporal data granularity, are important data quality dimensions for assessing person-generated wearable device data for research purposes. Conclusions: In this study, intrinsic and contextual and fitness-for-use data quality dimensions for person-generated wearable device data were identified. The dimensions were adapted from data quality terminologies and frameworks for the secondary use of EHR data with a few modifications. Further research on how data quality can be assessed with respect to each dimension is needed. ", doi="10.2196/31618", url="https://mhealth.jmir.org/2021/12/e31618", url="http://www.ncbi.nlm.nih.gov/pubmed/34941540" } @Article{info:doi/10.2196/19250, author="Yeng, Kandabongee Prosper and Nweke, Obiora Livinus and Yang, Bian and Ali Fauzi, Muhammad and Snekkenes, Arthur Einar", title="Artificial Intelligence--Based Framework for Analyzing Health Care Staff Security Practice: Mapping Review and Simulation Study", journal="JMIR Med Inform", year="2021", month="Dec", day="22", volume="9", number="12", pages="e19250", keywords="artificial intelligence", keywords="machine learning", keywords="health care", keywords="security practice", keywords="framework", keywords="security", keywords="modeling", keywords="analysis", abstract="Background: Blocklisting malicious activities in health care is challenging in relation to access control in health care security practices due to the fear of preventing legitimate access for therapeutic reasons. Inadvertent prevention of legitimate access can contravene the availability trait of the confidentiality, integrity, and availability triad, and may result in worsening health conditions, leading to serious consequences, including deaths. Therefore, health care staff are often provided with a wide range of access such as a ``breaking-the-glass'' or ``self-authorization'' mechanism for emergency access. However, this broad access can undermine the confidentiality and integrity of sensitive health care data because breaking-the-glass can lead to vast unauthorized access, which could be problematic when determining illegitimate access in security practices. Objective: A review was performed to pinpoint appropriate artificial intelligence (AI) methods and data sources that can be used for effective modeling and analysis of health care staff security practices. Based on knowledge obtained from the review, a framework was developed and implemented with simulated data to provide a comprehensive approach toward effective modeling and analyzing security practices of health care staff in real access logs. Methods: Our approach began with a mapping review to identify AI methods, data sources and their attributes, and other relevant categories as inputs for framework development.
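Echoing the dimensions identified above (Cho et al), simple per-dimension checks for a wearable step-count table might look as follows; the thresholds, columns, and sample rows are illustrative assumptions.

```python
# Sketch of per-dimension data quality checks for wearable step counts (hypothetical
# thresholds; the dimensions follow the conformance/completeness/plausibility framing above).
import pandas as pd

df = pd.DataFrame({
    "date": ["2021-01-01", "2021-01-02", "2021-01-02", "not-a-date"],
    "steps": [8500, None, 250000, 4200],
})

# Conformance: values parse into the expected type/format.
parsed = pd.to_datetime(df["date"], errors="coerce")
conformance = parsed.notna().mean()

# Completeness: share of non-missing measurements.
completeness = df["steps"].notna().mean()

# Plausibility: share of rows within a believable daily range (assumed 0-100,000 steps).
plausibility = df["steps"].between(0, 100_000).mean()

print(f"conformance={conformance:.2f}, completeness={completeness:.2f}, "
      f"plausibility={plausibility:.2f}")
```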
To assess implementation of the framework, electronic health record (EHR) log data were simulated and analyzed, and the performance of various approaches in the framework was compared. Results: Among the total 130 articles initially identified, 18 met the inclusion and exclusion criteria. A thorough assessment and analysis of the included articles revealed that K-nearest neighbor, Bayesian network, and decision tree (C4.5) algorithms were predominantly applied to EHR and network logs with varying input features of health care staff security practices. Based on the review results, a framework was developed and implemented with simulated logs. The decision tree obtained the best precision of 0.655, whereas the best recall was achieved by the support vector machine (SVM) algorithm at 0.977. However, the best F1-score was obtained by random forest at 0.775. In brief, three classifiers (random forest, decision tree, and SVM) in the two-class approach achieved the best precision of 0.998. Conclusions: The security practices of health care staff can be effectively analyzed using a two-class approach to detect malicious and nonmalicious security practices. Based on our comparative study, the algorithms that can effectively be used in related studies include random forest, decision tree, and SVM. Deviations of security practices from required health care staff's security behavior in the big data context can be analyzed with real access logs to define appropriate incentives for improving conscious care security practice. ", doi="10.2196/19250", url="https://medinform.jmir.org/2021/12/e19250", url="http://www.ncbi.nlm.nih.gov/pubmed/34941549" } @Article{info:doi/10.2196/30970, author="Paris, Nicolas and Lamer, Antoine and Parrot, Adrien", title="Transformation and Evaluation of the MIMIC Database in the OMOP Common Data Model: Development and Usability Study", journal="JMIR Med Inform", year="2021", month="Dec", day="14", volume="9", number="12", pages="e30970", keywords="data reuse", keywords="open data", keywords="OMOP", keywords="common data model", keywords="critical care", keywords="machine learning", keywords="big data", keywords="health informatics", keywords="health data", keywords="health database", keywords="electronic health records", keywords="open access database", keywords="digital health", keywords="intensive care", keywords="health care", abstract="Background: In the era of big data, the intensive care unit (ICU) is likely to benefit from real-time computer analysis and modeling based on close patient monitoring and electronic health record data. The Medical Information Mart for Intensive Care (MIMIC) is the first open access database in the ICU domain. Many studies have shown that common data models (CDMs) improve database searching by allowing code, tools, and experience to be shared. The Observational Medical Outcomes Partnership (OMOP) CDM is spreading all over the world. Objective: The objective was to transform MIMIC into an OMOP database and to evaluate the benefits of this transformation for analysts. Methods: We transformed MIMIC (version 1.4.21) into OMOP format (version 5.3.3.1) through semantic and structural mapping. The structural mapping aimed at moving the MIMIC data into the right place in OMOP, with some data transformations. The mapping was divided into 3 phases: conception, implementation, and evaluation. The conceptual mapping aimed at aligning the MIMIC local terminologies to OMOP's standard ones. It consisted of 3 phases: integration, alignment, and evaluation. 
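The two-class approach above (Yeng et al) can be sketched on simulated access-log features; the malicious pattern, feature set, and metrics below are assumptions for illustration, not the study's simulation.

```python
# Sketch of a two-class approach: classify simulated EHR access-log events as
# malicious vs nonmalicious and report precision/recall/F1 (all data synthetic).
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
n = 2000
# Simulated features: access hour, records viewed, and an off-duty flag.
X = np.column_stack([rng.integers(0, 24, n), rng.poisson(5, n), rng.integers(0, 2, n)])
# Assumed malicious pattern: off-duty access touching many records.
y = ((X[:, 2] == 1) & (X[:, 1] > 8)).astype(int)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
clf = RandomForestClassifier(n_estimators=200, random_state=0).fit(X_tr, y_tr)
pred = clf.predict(X_te)
print(precision_score(y_te, pred), recall_score(y_te, pred), f1_score(y_te, pred))
```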
A documented, tested, versioned, exemplified, and open repository was set up to support the transformation and improvement of the MIMIC community's source code. The resulting data set was evaluated over a 48-hour datathon. Results: With an investment of 2 people for 500 hours, 64\% of the data items of the 26 MIMIC tables were standardized into the OMOP CDM and 78\% of the source concepts mapped to reference terminologies. The model proved its ability to support community contributions and was well received during the datathon, with 160 participants and 15,000 requests executed with a maximum duration of 1 minute. Conclusions: The resulting MIMIC-OMOP data set is the first MIMIC-OMOP data set available free of charge with real deidentified data ready for replicable intensive care research. This approach can be generalized to any medical field. ", doi="10.2196/30970", url="https://medinform.jmir.org/2021/12/e30970", url="http://www.ncbi.nlm.nih.gov/pubmed/34904958" } @Article{info:doi/10.2196/29286, author="Bannay, Aur{\'e}lie and Bories, Mathilde and Le Corre, Pascal and Riou, Christine and Lemordant, Pierre and Van Hille, Pascal and Chazard, Emmanuel and Dode, Xavier and Cuggia, Marc and Bouzill{\'e}, Guillaume", title="Leveraging National Claims and Hospital Big Data: Cohort Study on a Statin-Drug Interaction Use Case", journal="JMIR Med Inform", year="2021", month="Dec", day="13", volume="9", number="12", pages="e29286", keywords="drug interactions", keywords="statins", keywords="administrative claims", keywords="health care", keywords="big data", keywords="data linking", keywords="data warehousing", abstract="Background: Linking different sources of medical data is a promising approach to analyze care trajectories. The aim of the INSHARE (Integrating and Sharing Health Big Data for Research) project was to provide the blueprint for a technological platform that facilitates integration, sharing, and reuse of data from 2 sources: the clinical data warehouse (CDW) of the Rennes academic hospital, called eHOP (entrep{\^o}t H{\^o}pital), and a data set extracted from the French national claim data warehouse (Syst{\`e}me National des Donn{\'e}es de Sant{\'e} [SNDS]). Objective: This study aims to demonstrate how the INSHARE platform can support big data analytic tasks in the health field using a pharmacovigilance use case based on statin consumption and statin-drug interactions. Methods: A Spark distributed cluster-computing framework was used for the record linkage procedure and all analyses. A semideterministic record linkage method based on the common variables between the chosen data sources was developed to identify all patients discharged after at least one hospital stay at the Rennes academic hospital between 2015 and 2017. The use-case study focused on a cohort of patients treated with statins prescribed by their general practitioner or during their hospital stay. Results: The whole process (record linkage procedure and use-case analyses) required 88 minutes. Of the 161,532 and 164,316 patients from the SNDS and eHOP CDW data sets, respectively, 159,495 patients were successfully linked (98.74\% and 97.07\% of patients from SNDS and eHOP CDW, respectively). Of the 16,806 patients with at least one statin delivery, 8293 patients started the consumption before and continued during the hospital stay, 6382 patients stopped statin consumption at hospital admission, and 2131 patients initiated statins in hospital.
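A deterministic core of the record linkage step above can be sketched with PySpark; the column names and toy rows are assumptions, and the authors' semideterministic method additionally relaxes or weights some matching keys.

```python
# Sketch of a deterministic step of record linkage between claims and hospital records
# with PySpark (column names and rows are hypothetical, not the INSHARE variables).
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("linkage-sketch").getOrCreate()

claims = spark.createDataFrame(
    [("1955-03-02", "F", "35000", "2016-04-01"), ("1948-11-20", "M", "35200", "2016-05-10")],
    ["birth_date", "sex", "zip", "admission_date"])
hospital = spark.createDataFrame(
    [("1955-03-02", "F", "35000", "2016-04-01"), ("1960-01-05", "F", "35000", "2016-06-07")],
    ["birth_date", "sex", "zip", "admission_date"])

# Exact match on the shared variables; a semideterministic scheme would relax some keys.
linked = claims.join(hospital, on=["birth_date", "sex", "zip", "admission_date"], how="inner")
print(linked.count())  # number of linked patient stays
spark.stop()
```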
Statin-drug interactions occurred more frequently during hospitalization than in the community (3800/10,424, 36.45\% and 3253/14,675, 22.17\%, respectively; P<.001). Only 121 patients had the most severe level of statin-drug interaction. Hospital stay burden (length of stay and in-hospital mortality) was more severe in patients with statin-drug interactions during hospitalization. Conclusions: This study demonstrates the added value of combining and reusing clinical and claim data to provide large-scale measures of drug-drug interaction prevalence and care pathways outside hospitals. It builds a path to move the current health care system toward a Learning Health System using knowledge generated from research on real-world health data. ", doi="10.2196/29286", url="https://medinform.jmir.org/2021/12/e29286", url="http://www.ncbi.nlm.nih.gov/pubmed/34898457" } @Article{info:doi/10.2196/27984, author="Janssen, Anna and Talic, Stella and Gasevic, Dragan and Kay, Judy and Shaw, Tim", title="Exploring the Intersection Between Health Professionals' Learning and eHealth Data: Protocol for a Comprehensive Research Program in Practice Analytics in Health Care", journal="JMIR Res Protoc", year="2021", month="Dec", day="9", volume="10", number="12", pages="e27984", keywords="digital health", keywords="health informatics", keywords="practice analytics in health care", keywords="health professions education", keywords="continuing professional development", abstract="Background: There is an increasing amount of electronic data sitting within the health system. These data have untapped potential to improve clinical practice if extracted efficiently and harnessed to change the behavior of health professionals. Furthermore, there is an increasing expectation from the government and peak bodies that both individual health professionals and health care organizations will use electronic data for a range of applications, including improving health service delivery and informing clinical practice and professional accreditation. Objective: The aim of this research program is to make eHealth data captured within tertiary health care organizations more actionable to health professionals for use in practice reflection, professional development, and other quality improvement activities. Methods: A multidisciplinary approach was used to connect academic experts from core disciplines of health and medicine, education and learning sciences, and engineering and information communication technology with government and health service partners to identify key problems preventing the health care industry from using electronic data to support health professional learning. This multidisciplinary approach was used to design a large-scale research program to solve the problem of making eHealth data more accessible to health professionals for practice reflection. The program will be delivered over 5 years by doctoral candidates undertaking research projects with discrete aims that run in parallel to achieving this program's objectives. Results: The process used to develop the research program identified 7 doctoral research projects to answer the program objectives, split across 3 streams. Conclusions: This research program has the potential to successfully unpack electronic data siloed within clinical sites and enable health professionals to use them to reflect on their practice and deliver informed and improved care. 
The program will contribute to current practices by fostering stronger connections between industry and academia, interlinking doctoral research projects to solve complex problems, and creating new knowledge for clinical sites on how data can be used to understand and improve performance. Furthermore, the program aims to affect policy by developing insights on how professional development programs may be strengthened to enhance their alignment with clinical practice. The key contributions of this paper include the introduction of a new conceptualized research program, Practice Analytics in Health care, by describing the foundational academic disciplines that the program is formed of and presenting scientific methods for its design and development. International Registered Report Identifier (IRRID): PRR1-10.2196/27984 ", doi="10.2196/27984", url="https://www.researchprotocols.org/2021/12/e27984", url="http://www.ncbi.nlm.nih.gov/pubmed/34889768" } @Article{info:doi/10.2196/28305, author="Ng, Reuben", title="Anti-Asian Sentiments During the COVID-19 Pandemic Across 20 Countries: Analysis of a 12-Billion-Word News Media Database", journal="J Med Internet Res", year="2021", month="Dec", day="8", volume="23", number="12", pages="e28305", keywords="racism", keywords="COVID-19", keywords="anti-Asian sentiments", keywords="psychomics", keywords="quantitative social science", keywords="culture", keywords="text as data", keywords="xenophobia", keywords="digital humanities", abstract="Background: US president Joe Biden signed an executive action directing federal agencies to combat hate crimes and racism against Asians, which have percolated during the COVID-19 pandemic. This is one of the first known empirical studies to dynamically test whether global societal sentiments toward Asians have become more negative during the COVID-19 pandemic. Objective: This study aimed to investigate whether global societal sentiments toward Asians across 20 countries have become more negative, month by month, from before the pandemic (October 2019) to May 2020, along with the pandemic (incidence and mortality rates) and cultural (Hofstede's cultural dimensions) predictors of this trend. Methods: We leveraged a 12-billion-word web-based media database, with over 30 million newspaper and magazine articles taken from over 7000 sites across 20 countries, and identified 6 synonyms of ``Asian'' that are related to the coronavirus. We compiled their most frequently used descriptors (collocates) from October 2019 to May 2020 across 20 countries, culminating in 85,827 collocates that were rated by 2 independent researchers to provide a Cumulative Asian Sentiment Score (CASS) per month. This allowed us to track significant shifts in societal sentiments toward Asians from a baseline period (October to December 2019) to the onset of the pandemic (January to May 2020). We tested the competing predictors of this trend: pandemic variables of incidence and mortality rates measured monthly for all 20 countries taken from the Oxford COVID-19 Government Response Tracker, and Hofstede's Cultural Dimensions of Individualism, Power Distance, Uncertainty Avoidance, and Masculinity for the 20 countries. Results: Before the pandemic in December 2019, Jamaica and New Zealand evidenced the most negative societal sentiments toward Asians; when news about the coronavirus was released in January 2020, the United States and Nigeria evidenced the most negative sentiments toward Asians among 20 countries. 
Globally, sentiments toward Asians became more negative---a significant linear decline during the COVID-19 pandemic. CASS trended neutral before the pandemic during the baseline period of October to November 2019 and then plummeted in February 2020. CASS was, ironically, not predicted by COVID-19's incidence and mortality rates, but rather by Hofstede's cultural dimensions: individualism, power distance, and uncertainty avoidance---as shown by mixed models (N=28,494). Specifically, higher power distance, individualism, and uncertainty avoidance were associated with negative societal sentiments toward Asians. Conclusions: Racism, in the form of anti-Asian sentiment, is deep-seated and predicated on structural undercurrents of culture. The COVID-19 pandemic may have indirectly and inadvertently exacerbated societal tendencies for racism. Our study lays the important groundwork for designing culturally nuanced and contextually appropriate interventions and policy communications to ameliorate anti-Asian racism. ", doi="10.2196/28305", url="https://www.jmir.org/2021/12/e28305", url="http://www.ncbi.nlm.nih.gov/pubmed/34678754" } @Article{info:doi/10.2196/32698, author="Pan, Youcheng and Wang, Chenghao and Hu, Baotian and Xiang, Yang and Wang, Xiaolong and Chen, Qingcai and Chen, Junjie and Du, Jingcheng", title="A BERT-Based Generation Model to Transform Medical Texts to SQL Queries for Electronic Medical Records: Model Development and Validation", journal="JMIR Med Inform", year="2021", month="Dec", day="8", volume="9", number="12", pages="e32698", keywords="electronic medical record", keywords="text-to-SQL generation", keywords="BERT", keywords="grammar-based decoding", keywords="tree-structured intermediate representation", abstract="Background: Electronic medical records (EMRs) are usually stored in relational databases that require SQL queries to retrieve information of interest. Effectively completing such queries can be a challenging task for medical experts due to the barriers in expertise. Existing text-to-SQL generation studies have not been fully embraced in the medical domain. Objective: The objective of this study was to propose a neural generation model that can jointly consider the characteristics of medical text and the SQL structure to automatically transform medical texts to SQL queries for EMRs. Methods: We proposed a medical text--to-SQL model (MedTS), which employed a pretrained Bidirectional Encoder Representations From Transformers model as the encoder and leveraged a grammar-based long short-term memory network as the decoder to predict the intermediate representation that can easily be transformed into the final SQL query. We adopted the syntax tree as the intermediate representation rather than directly regarding the SQL query as an ordinary word sequence, which is more in line with the tree-structure nature of SQL and can also effectively reduce the search space during generation. Experiments were conducted on the MIMICSQL dataset, and 5 competitor methods were compared. Results: Experimental results demonstrated that MedTS achieved accuracies of 0.784 and 0.899 on the test set in terms of logic form and execution, respectively, which significantly outperformed the existing state-of-the-art methods. Further analyses proved that the performance on each component of the generated SQL was relatively balanced and offered substantial improvements.
Conclusions: The proposed MedTS was effective and robust for improving the performance of medical text--to-SQL generation, indicating strong potential to be applied in the real medical scenario. ", doi="10.2196/32698", url="https://medinform.jmir.org/2021/12/e32698", url="http://www.ncbi.nlm.nih.gov/pubmed/34889749" } @Article{info:doi/10.2196/25022, author="Singh, Janmajay and Sato, Masahiro and Ohkuma, Tomoko", title="On Missingness Features in Machine Learning Models for Critical Care: Observational Study", journal="JMIR Med Inform", year="2021", month="Dec", day="8", volume="9", number="12", pages="e25022", keywords="electronic health records", keywords="informative missingness", keywords="machine learning", keywords="missing data", keywords="hospital mortality", keywords="sepsis", abstract="Background: Missing data in electronic health records is inevitable and considered to be nonrandom. Several studies have found that features indicating missing patterns (missingness) encode useful information about a patient's health and advocate for their inclusion in clinical prediction models. But their effectiveness has not been comprehensively evaluated. Objective: The goal of the research is to study the effect of including informative missingness features in machine learning models for various clinically relevant outcomes and explore robustness of these features across patient subgroups and task settings. Methods: A total of 48,336 electronic health records from the 2012 and 2019 PhysioNet Challenges were used, and mortality, length of stay, and sepsis outcomes were chosen. The latter dataset was multicenter, allowing external validation. Gated recurrent units were used to learn sequential patterns in the data and classify or predict labels of interest. Models were evaluated on various criteria and across population subgroups evaluating discriminative ability and calibration. Results: Generally improved model performance in retrospective tasks was observed on including missingness features. Extent of improvement depended on the outcome of interest (area under the curve of the receiver operating characteristic [AUROC] improved from 1.2\% to 7.7\%) and even patient subgroup. However, missingness features did not display utility in a simulated prospective setting, being outperformed (0.9\% difference in AUROC) by the model relying only on pathological features. This was despite leading to earlier detection of disease (true positives), since including these features led to a concomitant rise in false positive detections. Conclusions: This study comprehensively evaluated effectiveness of missingness features on machine learning models. A detailed understanding of how these features affect model performance may lead to their informed use in clinical settings especially for administrative tasks like length of stay prediction where they present the greatest benefit. While missingness features, representative of health care processes, vary greatly due to intra- and interhospital factors, they may still be used in prediction models for clinically relevant outcomes. However, their use in prospective models producing frequent predictions needs to be explored further. 
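Missingness indicators of the kind evaluated above are typically binary flags derived from the observation pattern before imputation. A minimal sketch of the general technique in Python (all column names are hypothetical, and the study's own gated-recurrent-unit pipeline is not reproduced here):

```python
import numpy as np
import pandas as pd

# Hypothetical hourly vitals with informative gaps (illustration only).
vitals = pd.DataFrame({
    "heart_rate": [88, np.nan, 91, np.nan],
    "lactate": [np.nan, 2.1, np.nan, np.nan],
})

# Missingness indicators: 1 where a value was recorded, 0 where absent.
observed = vitals.notna().astype(int).add_suffix("_observed")

# Simple forward-fill-then-median imputation for the value channels.
imputed = vitals.ffill().fillna(vitals.median())

# Concatenate values and indicators into one model-ready feature matrix,
# e.g., as sequential input to a recurrent model over the time axis.
features = pd.concat([imputed, observed], axis=1)
print(features)
```

The indicator columns carry the "health care process" signal the abstract describes, separately from the imputed physiological values.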
", doi="10.2196/25022", url="https://medinform.jmir.org/2021/12/e25022", url="http://www.ncbi.nlm.nih.gov/pubmed/34889756" } @Article{info:doi/10.2196/26407, author="Wu, Hong and Ji, Jiatong and Tian, Haimei and Chen, Yao and Ge, Weihong and Zhang, Haixia and Yu, Feng and Zou, Jianjun and Nakamura, Mitsuhiro and Liao, Jun", title="Chinese-Named Entity Recognition From Adverse Drug Event Records: Radical Embedding-Combined Dynamic Embedding--Based BERT in a Bidirectional Long Short-term Conditional Random Field (Bi-LSTM-CRF) Model", journal="JMIR Med Inform", year="2021", month="Dec", day="1", volume="9", number="12", pages="e26407", keywords="deep learning", keywords="BERT", keywords="adverse drug reaction", keywords="named entity recognition", keywords="electronic medical records", abstract="Background: With the increasing variety of drugs, the incidence of adverse drug events (ADEs) is increasing year by year. Massive numbers of ADEs are recorded in electronic medical records and adverse drug reaction (ADR) reports, which are important sources of potential ADR information. Meanwhile, it is essential to make latent ADR information automatically available for better postmarketing drug safety reevaluation and pharmacovigilance. Objective: This study describes how to identify ADR-related information from Chinese ADE reports. Methods: Our study established an efficient automated tool, named BBC-Radical. BBC-Radical is a model that consists of 3 components: Bidirectional Encoder Representations from Transformers (BERT), bidirectional long short-term memory (bi-LSTM), and conditional random field (CRF). The model identifies ADR-related information from Chinese ADR reports. Token features and radical features of Chinese characters were used to represent the common meaning of a group of words. BERT and Bi-LSTM-CRF were novel models that combined these features to conduct named entity recognition (NER) tasks in the free-text section of 24,890 ADR reports from the Jiangsu Province Adverse Drug Reaction Monitoring Center from 2010 to 2016. Moreover, the man-machine comparison experiment on the ADE records from Drum Tower Hospital was designed to compare the NER performance between the BBC-Radical model and a manual method. Results: The NER model achieved relatively high performance, with a precision of 96.4\%, recall of 96.0\%, and F1 score of 96.2\%. This indicates that the performance of the BBC-Radical model (precision 87.2\%, recall 85.7\%, and F1 score 86.4\%) is much better than that of the manual method (precision 86.1\%, recall 73.8\%, and F1 score 79.5\%) in the recognition task of each kind of entity. Conclusions: The proposed model was competitive in extracting ADR-related information from ADE reports, and the results suggest that the application of our method to extract ADR-related information is of great significance in improving the quality of ADR reports and postmarketing drug safety evaluation. ", doi="10.2196/26407", url="https://medinform.jmir.org/2021/12/e26407", url="http://www.ncbi.nlm.nih.gov/pubmed/34855616" } @Article{info:doi/10.2196/30308, author="St{\"o}hr, R. Mark and G{\"u}nther, Andreas and Majeed, W. 
Raphael", title="The Collaborative Metadata Repository (CoMetaR) Web App: Quantitative and Qualitative Usability Evaluation", journal="JMIR Med Inform", year="2021", month="Nov", day="29", volume="9", number="11", pages="e30308", keywords="usability", keywords="metadata", keywords="data visualization", keywords="semantic web", keywords="data management", keywords="data warehousing", keywords="communication barriers", keywords="quality improvement", keywords="biological ontologies", keywords="data curation", abstract="Background: In the field of medicine and medical informatics, the importance of comprehensive metadata has long been recognized, and the composition of metadata has become its own field of profession and research. To ensure sustainable and meaningful metadata are maintained, standards and guidelines such as the FAIR (Findability, Accessibility, Interoperability, Reusability) principles have been published. The compilation and maintenance of metadata is performed by field experts supported by metadata management apps. The usability of these apps, for example, in terms of ease of use, efficiency, and error tolerance, crucially determines their benefit to those interested in the data. Objective: This study aims to provide a metadata management app with high usability that assists scientists in compiling and using rich metadata. We aim to evaluate our recently developed interactive web app for our collaborative metadata repository (CoMetaR). This study reflects how real users perceive the app by assessing usability scores and explicit usability issues. Methods: We evaluated the CoMetaR web app by measuring the usability of 3 modules: core module, provenance module, and data integration module. We defined 10 tasks in which users must acquire information specific to their user role. The participants were asked to complete the tasks in a live web meeting. We used the System Usability Scale questionnaire to measure the usability of the app. For qualitative analysis, we applied a modified think aloud method with the following thematic analysis and categorization into the ISO 9241-110 usability categories. Results: A total of 12 individuals participated in the study. We found that over 97\% (85/88) of all the tasks were completed successfully. We measured usability scores of 81, 81, and 72 for the 3 evaluated modules. The qualitative analysis resulted in 24 issues with the app. Conclusions: A usability score of 81 implies very good usability for the 2 modules, whereas a usability score of 72 still indicates acceptable usability for the third module. We identified 24 issues that serve as starting points for further development. Our method proved to be effective and efficient in terms of effort and outcome. It can be adapted to evaluate apps within the medical informatics field and potentially beyond. 
", doi="10.2196/30308", url="https://medinform.jmir.org/2021/11/e30308", url="http://www.ncbi.nlm.nih.gov/pubmed/34847059" } @Article{info:doi/10.2196/29838, author="Saqib, Kiran and Khan, Fozia Amber and Butt, Ahmad Zahid", title="Machine Learning Methods for Predicting Postpartum Depression: Scoping Review", journal="JMIR Ment Health", year="2021", month="Nov", day="24", volume="8", number="11", pages="e29838", keywords="machine learning", keywords="postpartum depression", keywords="big data", keywords="mobile phone", abstract="Background: Machine learning (ML) offers vigorous statistical and probabilistic techniques that can successfully predict certain clinical conditions using large volumes of data. A review of ML and big data research analytics in maternal depression is pertinent and timely, given the rapid technological developments in recent years. Objective: This study aims to synthesize the literature on ML and big data analytics for maternal mental health, particularly the prediction of postpartum depression (PPD). Methods: We used a scoping review methodology using the Arksey and O'Malley framework to rapidly map research activity in ML for predicting PPD. Two independent researchers searched PsycINFO, PubMed, IEEE Xplore, and the ACM Digital Library in September 2020 to identify relevant publications in the past 12 years. Data were extracted from the articles' ML model, data type, and study results. Results: A total of 14 studies were identified. All studies reported the use of supervised learning techniques to predict PPD. Support vector machine and random forest were the most commonly used algorithms in addition to Naive Bayes, regression, artificial neural network, decision trees, and XGBoost (Extreme Gradient Boosting). There was considerable heterogeneity in the best-performing ML algorithm across the selected studies. The area under the receiver operating characteristic curve values reported for different algorithms were support vector machine (range 0.78-0.86), random forest method (0.88), XGBoost (0.80), and logistic regression (0.93). Conclusions: ML algorithms can analyze larger data sets and perform more advanced computations, which can significantly improve the detection of PPD at an early stage. Further clinical research collaborations are required to fine-tune ML algorithms for prediction and treatment. ML might become part of evidence-based practice in addition to clinical knowledge and existing research evidence. ", doi="10.2196/29838", url="https://mental.jmir.org/2021/11/e29838", url="http://www.ncbi.nlm.nih.gov/pubmed/34822337" } @Article{info:doi/10.2196/31750, author="Gierend, Kerstin and Kr{\"u}ger, Frank and Waltemath, Dagmar and F{\"u}nfgeld, Maximilian and Ganslandt, Thomas and Zeleke, Alamirrew Atinkut", title="Approaches and Criteria for Provenance in Biomedical Data Sets and Workflows: Protocol for a Scoping Review", journal="JMIR Res Protoc", year="2021", month="Nov", day="22", volume="10", number="11", pages="e31750", keywords="provenance", keywords="biomedical", keywords="workflow", keywords="data sharing", keywords="lineage", keywords="scoping review", keywords="data genesis", keywords="scientific data", keywords="digital objects", keywords="healthcare data", abstract="Background: Provenance supports the understanding of data genesis, and it is a key factor to ensure the trustworthiness of digital objects containing (sensitive) scientific data. 
Provenance information contributes to a better understanding of scientific results and fosters collaboration on existing data as well as data sharing. This encompasses defining comprehensive concepts and standards for transparency and traceability, reproducibility, validity, and quality assurance during clinical and scientific data workflows and research. Objective: The aim of this scoping review is to investigate existing evidence regarding approaches and criteria for provenance tracking as well as disclosing current knowledge gaps in the biomedical domain. This review covers modeling aspects as well as metadata frameworks for meaningful and usable provenance information during creation, collection, and processing of (sensitive) scientific biomedical data. This review also covers the examination of quality aspects of provenance criteria. Methods: This scoping review will follow the methodological framework by Arksey and O'Malley. Relevant publications will be obtained by querying PubMed and Web of Science. All papers published in English between January 1, 2006, and March 23, 2021, will be included. Data retrieval will be accompanied by a manual search for grey literature. Potential publications will then be exported into a reference management software, and duplicates will be removed. Afterwards, the obtained set of papers will be transferred into a systematic review management tool. All publications will be screened, extracted, and analyzed: title and abstract screening will be carried out by 4 independent reviewers. A majority vote is required for consensus on the eligibility of papers based on the defined inclusion and exclusion criteria. Full-text reading will be performed independently by 2 reviewers and, in the last step, key information will be extracted using a pretested template. If agreement cannot be reached, the conflict will be resolved by a domain expert. Charted data will be analyzed by categorizing and summarizing the individual data items based on the research questions. Tabular or graphical overviews will be given, if applicable. Results: The reporting follows the extension of the Preferred Reporting Items for Systematic reviews and Meta-Analyses statements for Scoping Reviews. Electronic database searches in PubMed and Web of Science resulted in 469 matches after deduplication. As of September 2021, the scoping review is in the full-text screening stage. The data extraction using the pretested charting template will follow the full-text screening stage. We expect the scoping review report to be completed by February 2022. Conclusions: Information about the origin of healthcare data has a major impact on the quality and the reusability of scientific results as well as follow-up activities. This protocol outlines plans for a scoping review that will provide information about current approaches, challenges, or knowledge gaps with provenance tracking in biomedical sciences.
International Registered Report Identifier (IRRID): DERR1-10.2196/31750 ", doi="10.2196/31750", url="https://www.researchprotocols.org/2021/11/e31750", url="http://www.ncbi.nlm.nih.gov/pubmed/34813494" } @Article{info:doi/10.2196/30277, author="Yang, Yujie and Zheng, Jing and Du, Zhenzhen and Li, Ye and Cai, Yunpeng", title="Accurate Prediction of Stroke for Hypertensive Patients Based on Medical Big Data and Machine Learning Algorithms: Retrospective Study", journal="JMIR Med Inform", year="2021", month="Nov", day="10", volume="9", number="11", pages="e30277", keywords="stroke", keywords="medical big data", keywords="electronic health records", keywords="machine learning", keywords="risk prediction", keywords="hypertension", abstract="Background: Stroke risk assessment is an important means of primary prevention, but the applicability of existing stroke risk assessment scales in the Chinese population has always been controversial. A prospective study is a common method of medical research, but it is time-consuming and labor-intensive. Medical big data has been demonstrated to promote disease risk factor discovery and prognosis, attracting broad research interest. Objective: We aimed to establish a high-precision stroke risk prediction model for hypertensive patients based on historical electronic medical record data and machine learning algorithms. Methods: Based on the Shenzhen Health Information Big Data Platform, a total of 57,671 patients were screened from 250,788 registered patients with hypertension, of whom 9421 had stroke onset during the 3-year follow-up. In addition to baseline characteristics and historical symptoms, we constructed some trend characteristics from multitemporal medical records. Stratified sampling according to gender ratio and age stratification was implemented to balance the positive and negative cases, and the final 19,953 samples were randomly divided into a training set and test set according to a ratio of 7:3. We used 4 machine learning algorithms for modeling, and the risk prediction performance was compared with the traditional risk scales. We also analyzed the nonlinear effect of continuous characteristics on stroke onset. Results: The tree-based integration algorithm extreme gradient boosting achieved the optimal performance with an area under the receiver operating characteristic curve of 0.9220, surpassing the other 3 traditional machine learning algorithms. Compared with 2 traditional risk scales, the Framingham stroke risk profiles and the Chinese Multiprovincial Cohort Study, our proposed model achieved better performance on the independent validation set, and the area under the receiver operating characteristic value increased by 0.17. Further nonlinear effect analysis revealed the importance of multitemporal trend characteristics in stroke risk prediction, which will benefit the standardized management of hypertensive patients. Conclusions: A high-precision 3-year stroke risk prediction model for hypertensive patients was established, and the model's performance was verified by comparing it with the traditional risk scales. Multitemporal trend characteristics played an important role in stroke onset, and thus the model could be deployed to electronic health record systems to assist in more pervasive, preemptive stroke risk screening, enabling higher efficiency of early disease prevention and intervention. 
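As a rough sketch of the modeling step described above (not the authors' pipeline), the following trains a gradient-boosted classifier on synthetic stand-in features with the study's 7:3 split and evaluates AUROC; it assumes the xgboost and scikit-learn packages, and every feature and label here is simulated:

```python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

rng = np.random.default_rng(0)
n = 2000
# Synthetic stand-ins for baseline and multitemporal trend characteristics.
X = rng.normal(size=(n, 8))
logits = 1.5 * X[:, 0] - 1.0 * X[:, 3] + 0.5 * X[:, 5]
y = (rng.random(n) < 1 / (1 + np.exp(-logits))).astype(int)

# 7:3 split, mirroring the ratio used in the study.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

model = XGBClassifier(
    n_estimators=200, max_depth=3, learning_rate=0.1, eval_metric="logloss"
)
model.fit(X_tr, y_tr)
print("AUROC:", roc_auc_score(y_te, model.predict_proba(X_te)[:, 1]))
```

The study's reported advantage came largely from engineered trend features over multitemporal records, which this toy example does not attempt to reproduce.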
", doi="10.2196/30277", url="https://medinform.jmir.org/2021/11/e30277", url="http://www.ncbi.nlm.nih.gov/pubmed/34757322" } @Article{info:doi/10.2196/34493, author="Clay, Ieuan and Angelopoulos, Christian and Bailey, Lord Anne and Blocker, Aaron and Carini, Simona and Carvajal, Rodrigo and Drummond, David and McManus, F. Kimberly and Oakley-Girvan, Ingrid and Patel, B. Krupal and Szepietowski, Phillip and Goldsack, C. Jennifer", title="Sensor Data Integration: A New Cross-Industry Collaboration to Articulate Value, Define Needs, and Advance a Framework for Best Practices", journal="J Med Internet Res", year="2021", month="Nov", day="9", volume="23", number="11", pages="e34493", keywords="digital measures", keywords="data integration", keywords="patient centricity", keywords="utility", doi="10.2196/34493", url="https://www.jmir.org/2021/11/e34493", url="http://www.ncbi.nlm.nih.gov/pubmed/34751656" } @Article{info:doi/10.2196/26914, author="Sung, MinDong and Cha, Dongchul and Park, Rang Yu", title="Local Differential Privacy in the Medical Domain to Protect Sensitive Information: Algorithm Development and Real-World Validation", journal="JMIR Med Inform", year="2021", month="Nov", day="8", volume="9", number="11", pages="e26914", keywords="privacy-preserving", keywords="differential privacy", keywords="medical informatics", keywords="medical data", keywords="privacy", keywords="electronic health record", keywords="algorithm", keywords="development", keywords="validation", keywords="big data", keywords="feasibility", keywords="machine learning", keywords="synthetic data", abstract="Background: Privacy is of increasing interest in the present big data era, particularly the privacy of medical data. Specifically, differential privacy has emerged as the standard method for preservation of privacy during data analysis and publishing. Objective: Using machine learning techniques, we applied differential privacy to medical data with diverse parameters and checked the feasibility of our algorithms with synthetic data as well as the balance between data privacy and utility. Methods: All data were normalized to a range between --1 and 1, and the bounded Laplacian method was applied to prevent the generation of out-of-bound values after applying the differential privacy algorithm. To preserve the cardinality of the categorical variables, we performed postprocessing via discretization. The algorithm was evaluated using both synthetic and real-world data (from the eICU Collaborative Research Database). We evaluated the difference between the original data and the perturbated data using misclassification rates and the mean squared error for categorical data and continuous data, respectively. Further, we compared the performance of classification models that predict in-hospital mortality using real-world data. Results: The misclassification rate of categorical variables ranged between 0.49 and 0.85 when the value of $\epsilon$ was 0.1, and it converged to 0 as $\epsilon$ increased. When $\epsilon$ was between 102 and 103, the misclassification rate rapidly dropped to 0. Similarly, the mean squared error of the continuous variables decreased as $\epsilon$ increased. The performance of the model developed from perturbed data converged to that of the model developed from original data as $\epsilon$ increased. In particular, the accuracy of a random forest model developed from the original data was 0.801, and this value ranged from 0.757 to 0.81 when $\epsilon$ was 10-1 and 104, respectively. 
Conclusions: We applied local differential privacy to medical domain data, which are diverse and high dimensional. Higher noise may offer enhanced privacy, but it simultaneously hinders utility. We should choose an appropriate degree of noise for data perturbation to balance privacy and utility depending on specific situations. ", doi="10.2196/26914", url="https://medinform.jmir.org/2021/11/e26914", url="http://www.ncbi.nlm.nih.gov/pubmed/34747711" } @Article{info:doi/10.2196/30356, author="Wang, Jie and Wang, Xin and Wang, Lei and Peng, Yan", title="Health Information Needs of Young Chinese People Based on an Online Health Community: Topic and Statistical Analysis", journal="JMIR Med Inform", year="2021", month="Nov", day="8", volume="9", number="11", pages="e30356", keywords="information needs", keywords="young people", keywords="online health community", keywords="topic analysis", abstract="Background: The internet has been widely accessible and well accepted by young people; however, there is a limited understanding of the internet usage patterns and characteristics on issues related to health problems. The contents posted on online health communities (OHCs) are valuable resources to learn about youth's health information needs. Objective: In this study, we concurrently exploited statistical analysis and topic analysis of online health information needs to explore the distribution, impact factors, and topics of interest relevant to Chinese young people. Methods: We collected 60,478 health-related data sets posted by young people from a well-known Chinese OHC named xywy.com. Descriptive statistical analysis and correlation analysis were applied to find the distribution and influence factors of the information needs of Chinese young people. Furthermore, a general 4-step topic mining strategy was presented for sparse short texts, which included sentence vectorization, dimension reduction, clustering, and keyword generation. Results: In the Chinese OHC, Chinese young people had a high demand for information in the areas of gynecology and obstetrics, internal medicine, dermatology, plastic surgery, and surgery, and they focused on topics such as treatment, symptoms, causes, pathology, and diet. Females accounted for 69.67\% (42,136/60,478) and young adults accounted for 87.44\% (52,882/60,478) of all data. Gender, age, and disease type all had a significant effect on young people's information needs and topic preferences (P<.001). Conclusions: We conducted comprehensive analyses to discover the online health information needs of Chinese young people. The research findings are of great practical value to carry out health education and health knowledge dissemination inside and outside of schools according to the interests of youth, enable the innovation of information services in OHCs, and improve the health literacy of young people. 
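The 4-step topic mining strategy described above (sentence vectorization, dimension reduction, clustering, keyword generation) maps naturally onto standard tooling. A minimal scikit-learn sketch with English placeholder posts, using TF-IDF as a simple stand-in for the study's sentence vectorization:

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

posts = [
    "acne treatment for oily skin", "diet advice for gastritis symptoms",
    "causes of irregular periods", "skin rash cream recommendation",
    "stomach pain after meals diet", "period pain and cycle tracking",
]

# Step 1: sentence vectorization.
vec = TfidfVectorizer()
X = vec.fit_transform(posts)

# Step 2: dimension reduction. Step 3: clustering.
X_red = TruncatedSVD(n_components=3, random_state=0).fit_transform(X)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X_red)

# Step 4: keyword generation - top average TF-IDF terms per cluster.
terms = np.array(vec.get_feature_names_out())
for k in sorted(set(labels)):
    weights = np.asarray(X[labels == k].mean(axis=0)).ravel()
    print(k, terms[weights.argsort()[-3:][::-1]])
```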
", doi="10.2196/30356", url="https://medinform.jmir.org/2021/11/e30356", url="http://www.ncbi.nlm.nih.gov/pubmed/34747707" } @Article{info:doi/10.2196/26426, author="Sung, MinDong and Hahn, Sangchul and Han, Hoon Chang and Lee, Mo Jung and Lee, Jayoung and Yoo, Jinkyu and Heo, Jay and Kim, Sam Young and Chung, Soo Kyung", title="Event Prediction Model Considering Time and Input Error Using Electronic Medical Records in the Intensive Care Unit: Retrospective Study", journal="JMIR Med Inform", year="2021", month="Nov", day="4", volume="9", number="11", pages="e26426", keywords="machine learning", keywords="critical care", keywords="prediction model", keywords="intensive care unit", keywords="mortality", keywords="AKI", keywords="sepsis", abstract="Background: In the era of artificial intelligence, event prediction models are abundant. However, considering the limitation of the electronic medical record--based model, including the temporally skewed prediction and the record itself, these models could be delayed or could yield errors. Objective: In this study, we aim to develop multiple event prediction models in intensive care units to overcome their temporal skewness and evaluate their robustness against delayed and erroneous input. Methods: A total of 21,738 patients were included in the development cohort. Three events---death, sepsis, and acute kidney injury---were predicted. To overcome the temporal skewness, we developed three models for each event, which predicted the events in advance of three prespecified timepoints. Additionally, to evaluate the robustness against input error and delays, we added simulated errors and delayed input and calculated changes in the area under the receiver operating characteristic curve (AUROC) values. Results: Most of the AUROC and area under the precision-recall curve values of each model were higher than those of the conventional scores, as well as other machine learning models previously used. In the error input experiment, except for our proposed model, an increase in the noise added to the model lowered the resulting AUROC value. However, the delayed input did not show the performance decreased in this experiment. Conclusions: For a prediction model that was applicable in the real world, we considered not only performance but also temporal skewness, delayed input, and input error. ", doi="10.2196/26426", url="https://medinform.jmir.org/2021/11/e26426", url="http://www.ncbi.nlm.nih.gov/pubmed/34734837" } @Article{info:doi/10.2196/28763, author="Teramoto, Kei and Takeda, Toshihiro and Mihara, Naoki and Shimai, Yoshie and Manabe, Shirou and Kuwata, Shigeki and Kondoh, Hiroshi and Matsumura, Yasushi", title="Detecting Adverse Drug Events Through the Chronological Relationship Between the Medication Period and the Presence of Adverse Reactions From Electronic Medical Record Systems: Observational Study", journal="JMIR Med Inform", year="2021", month="Nov", day="1", volume="9", number="11", pages="e28763", keywords="real world data", keywords="electronic medical record", keywords="adverse drug event", abstract="Background: Medicines may cause various adverse reactions. An enormous amount of money and effort is spent investigating adverse drug events (ADEs) in clinical trials and postmarketing surveillance. Real-world data from multiple electronic medical records (EMRs) can make it easy to understand the ADEs that occur in actual patients. 
Objective: In this study, we generated a patient medication history database from physician orders recorded in EMRs, which allowed the period of medication to be clearly identified. Methods: We developed a method for detecting ADEs based on the chronological relationship between the presence of an adverse event and the medication period. To verify our method, we detected ADEs with alanine aminotransferase elevation in patients receiving aspirin, clopidogrel, and ticlopidine. The accuracy of the detection was evaluated with a chart review and by comparison with the Roussel Uclaf Causality Assessment Method (RUCAM), which is a standard method for detecting drug-induced liver injury. Results: The calculated rates of ADE with ALT elevation in patients receiving aspirin, clopidogrel, and ticlopidine were 3.33\% (868/26,059 patients), 3.70\% (188/5076 patients), and 5.69\% (226/3974 patients), respectively, which were in line with the rates of previous reports. We reviewed the medical records of the patients in whom ADEs were detected. Our method accurately predicted ADEs in 90\% (27/30patients) treated with aspirin, 100\% (9/9 patients) treated with clopidogrel, and 100\% (4/4 patients) treated with ticlopidine. Only 3 ADEs that were detected by the RUCAM were not detected by our method. Conclusions: These findings demonstrate that the present method is effective for detecting ADEs based on EMR data. ", doi="10.2196/28763", url="https://medinform.jmir.org/2021/11/e28763", url="http://www.ncbi.nlm.nih.gov/pubmed/33993103" } @Article{info:doi/10.2196/31294, author="Nunes Vilaza, Giovanna and Coyle, David and Bardram, Eyvind Jakob", title="Public Attitudes to Digital Health Research Repositories: Cross-sectional International Survey", journal="J Med Internet Res", year="2021", month="Oct", day="29", volume="23", number="10", pages="e31294", keywords="digital medicine", keywords="health informatics", keywords="health data repositories", keywords="personal sensing", keywords="technology acceptance", keywords="willingness to share data", keywords="human-centered computing", keywords="ethics", abstract="Background: Digital health research repositories propose sharing longitudinal streams of health records and personal sensing data between multiple projects and researchers. Motivated by the prospect of personalizing patient care (precision medicine), these initiatives demand broad public acceptance and large numbers of data contributors, both of which are challenging. Objective: This study investigates public attitudes toward possibly contributing to digital health research repositories to identify factors for their acceptance and to inform future developments. Methods: A cross-sectional online survey was conducted from March 2020 to December 2020. Because of the funded project scope and a multicenter collaboration, study recruitment targeted young adults in Denmark and Brazil, allowing an analysis of the differences between 2 very contrasting national contexts. Through closed-ended questions, the survey examined participants' willingness to share different data types, data access preferences, reasons for concern, and motivations to contribute. The survey also collected information about participants' demographics, level of interest in health topics, previous participation in health research, awareness of examples of existing research data repositories, and current attitudes about digital health research repositories. 
Data analysis consisted of descriptive frequency measures and statistical inferences (bivariate associations and logistic regressions). Results: The sample comprises 1017 respondents living in Brazil (1017/1600, 63.56\%) and 583 in Denmark (583/1600, 36.44\%). The demographics do not differ substantially between participants of these countries. The majority is aged between 18 and 27 years (933/1600, 58.31\%), is highly educated (992/1600, 62.00\%), uses smartphones (1562/1600, 97.63\%), and is in good health (1407/1600, 87.94\%). The analysis shows a vast majority were very motivated by helping future patients (1366/1600, 85.38\%) and researchers (1253/1600, 78.31\%), yet very concerned about unethical projects (1219/1600, 76.19\%), profit making without consent (1096/1600, 68.50\%), and cyberattacks (1055/1600, 65.94\%). Participants' willingness to share data is lower when sharing personal sensing data, such as the content of calls and texts (1206/1600, 75.38\%), in contrast to more traditional health research information. Only 13.44\% (215/1600) find it desirable to grant data access to private companies, and most would like to stay informed about which projects use their data (1334/1600, 83.38\%) and control future data access (1181/1600, 73.81\%). Findings indicate that favorable attitudes toward digital health research repositories are related to a personal interest in health topics (odds ratio [OR] 1.49, 95\% CI 1.10-2.02; P=.01), previous participation in health research studies (OR 1.70, 95\% CI 1.24-2.35; P=.001), and awareness of examples of research repositories (OR 2.78, 95\% CI 1.83-4.38; P<.001). Conclusions: This study reveals essential factors for acceptance and willingness to share personal data with digital health research repositories. Implications include the importance of being more transparent about the goals and beneficiaries of research projects using and re-using data from repositories, providing participants with greater autonomy for choosing who gets access to which parts of their data, and raising public awareness of the benefits of data sharing for research. In addition, future developments should engage with and reduce risks for those unwilling to participate. ", doi="10.2196/31294", url="https://www.jmir.org/2021/10/e31294", url="http://www.ncbi.nlm.nih.gov/pubmed/34714253" } @Article{info:doi/10.2196/29820, author="Monzani, Dario and Vergani, Laura and Pizzoli, Maria Silvia Francesca and Marton, Giulia and Pravettoni, Gabriella", title="Emotional Tone, Analytical Thinking, and Somatosensory Processes of a Sample of Italian Tweets During the First Phases of the COVID-19 Pandemic: Observational Study", journal="J Med Internet Res", year="2021", month="Oct", day="27", volume="23", number="10", pages="e29820", keywords="internet", keywords="mHealth", keywords="infodemiology", keywords="infoveillance", keywords="pandemic", keywords="public health", keywords="COVID-19", keywords="Twitter", keywords="psycholinguistic analysis", keywords="trauma", abstract="Background: The COVID-19 pandemic is a traumatic individual and collective chronic experience, with tremendous consequences on mental and psychological health that can also be reflected in people's use of words. Psycholinguistic analysis of tweets from Twitter allows obtaining information about people's emotional expression, analytical thinking, and somatosensory processes, which are particularly important in traumatic events contexts. 
Objective: We aimed to analyze the influence of official Italian COVID-19 daily data (new cases, deaths, and hospital discharges) and the phase of managing the pandemic on how people expressed emotions and their analytical thinking and somatosensory processes in Italian tweets written during the first phases of the COVID-19 pandemic in Italy. Methods: We retrieved 1,697,490 Italian COVID-19--related tweets written from February 24, 2020, to June 14, 2020, and analyzed them using LIWC2015 to calculate 3 summary psycholinguistic variables: emotional tone, analytical thinking, and somatosensory processes. Official daily data about new COVID-19 cases, deaths, and hospital discharges were retrieved from the Italian Prime Minister's Office and Civil Protection Department GitHub page. We considered 3 phases of managing the COVID-19 pandemic in Italy. We performed 3 general models, 1 for each summary variable as the dependent variable and with daily data and phase of managing the pandemic as independent variables. Results: General linear models to assess differences in daily scores of emotional tone, analytical thinking, and somatosensory processes were significant ($F_{6,104}$=21.53, P<.001, $R^2$=.55; $F_{5,105}$=9.20, P<.001, $R^2$=.30; $F_{6,104}$=6.15, P<.001, $R^2$=.26, respectively). Conclusions: The COVID-19 pandemic affects how people express emotions, analytical thinking, and somatosensory processes in tweets. Our study contributes to the investigation of pandemic psychological consequences through psycholinguistic analysis of social media textual data. ", doi="10.2196/29820", url="https://www.jmir.org/2021/10/e29820", url="http://www.ncbi.nlm.nih.gov/pubmed/34516386" } @Article{info:doi/10.2196/29584, author="Kummervold, E. Per and Martin, Sam and Dada, Sara and Kilich, Eliz and Denny, Chermain and Paterson, Pauline and Larson, J. Heidi", title="Categorizing Vaccine Confidence With a Transformer-Based Machine Learning Model: Analysis of Nuances of Vaccine Sentiment in Twitter Discourse", journal="JMIR Med Inform", year="2021", month="Oct", day="8", volume="9", number="10", pages="e29584", keywords="computer science", keywords="information technology", keywords="public health", keywords="health humanities", keywords="vaccines", keywords="machine learning", abstract="Background: Social media has become an established platform for individuals to discuss and debate various subjects, including vaccination. With growing conversations on the web and less than desired maternal vaccination uptake rates, these conversations could provide useful insights to inform future interventions. However, owing to the volume of web-based posts, manual annotation and analysis are difficult and time consuming. Automated processes for this type of analysis, such as natural language processing, have faced challenges in extracting complex stances such as attitudes toward vaccination from large amounts of text. Objective: The aim of this study is to build upon recent advances in transformer-based machine learning methods and test whether transformer-based machine learning could be used as a tool to assess the stance expressed in social media posts toward vaccination during pregnancy. Methods: A total of 16,604 tweets posted between November 1, 2018, and April 30, 2019, were selected using keyword searches related to maternal vaccination. After excluding irrelevant tweets, the remaining tweets were coded by 3 individual researchers into the categories Promotional, Discouraging, Ambiguous, and Neutral or No Stance.
After creating a final data set of 2722 unique tweets, multiple machine learning techniques were trained on a part of this data set and then tested and compared with the human annotators. Results: We found the accuracy of the machine learning techniques to be 81.8\% (F score=0.78) compared with the agreed score among the 3 annotators. For comparison, the accuracies of the individual annotators compared with the final score were 83.3\%, 77.9\%, and 77.5\%. Conclusions: This study demonstrates that we are able to achieve close to the same accuracy in categorizing tweets using our machine learning models as could be expected from a single human coder. The potential to use this automated process, which is reliable and accurate, could free valuable time and resources for conducting this analysis, in addition to informing potentially effective and necessary interventions. ", doi="10.2196/29584", url="https://medinform.jmir.org/2021/10/e29584", url="http://www.ncbi.nlm.nih.gov/pubmed/34623312" } @Article{info:doi/10.2196/30697, author="Foraker, Randi and Guo, Aixia and Thomas, Jason and Zamstein, Noa and Payne, RO Philip and Wilcox, Adam and ", title="The National COVID Cohort Collaborative: Analyses of Original and Computationally Derived Electronic Health Record Data", journal="J Med Internet Res", year="2021", month="Oct", day="4", volume="23", number="10", pages="e30697", keywords="synthetic data", keywords="protected health information", keywords="COVID-19", keywords="electronic health records and systems", keywords="data analysis", abstract="Background: Computationally derived (``synthetic'') data can enable the creation and analysis of clinical, laboratory, and diagnostic data as if they were the original electronic health record data. Synthetic data can support data sharing to answer critical research questions to address the COVID-19 pandemic. Objective: We aim to compare the results from analyses of synthetic data to those from original data and assess the strengths and limitations of leveraging computationally derived data for research purposes. Methods: We used the National COVID Cohort Collaborative's instance of MDClone, a big data platform with data-synthesizing capabilities (MDClone Ltd). We downloaded electronic health record data from 34 National COVID Cohort Collaborative institutional partners and tested three use cases, including (1) exploring the distributions of key features of the COVID-19--positive cohort; (2) training and testing predictive models for assessing the risk of admission among these patients; and (3) determining geospatial and temporal COVID-19--related measures and outcomes, and constructing their epidemic curves. We compared the results from synthetic data to those from original data using traditional statistics, machine learning approaches, and temporal and spatial representations of the data. Results: For each use case, the results of the synthetic data analyses successfully mimicked those of the original data such that the distributions of the data were similar and the predictive models demonstrated comparable performance. Although the synthetic and original data yielded overall nearly the same results, there were exceptions that included an odds ratio on either side of the null in multivariable analyses (0.97 vs 1.01) and differences in the magnitude of epidemic curves constructed for zip codes with low population counts. 
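One simple way to check that synthetic data mimic the distributions of the original data, as evaluated above, is a two-sample Kolmogorov-Smirnov test per continuous feature. A minimal sketch with stand-in columns (the data-synthesis step itself is out of scope here, and both samples are simulated):

```python
import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)

# Stand-ins for an original and a computationally derived feature column.
original = rng.normal(loc=50, scale=10, size=5000)     # e.g., age in years
derived = rng.normal(loc=50.3, scale=10.2, size=5000)  # close, not identical

stat, p = ks_2samp(original, derived)
print(f"KS statistic={stat:.3f}, p={p:.3f}")  # small statistic -> similar shapes
```

In practice this distributional check would be run per feature and complemented by comparing downstream model performance, as the study did.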
Conclusions: This paper presents the results of each use case and outlines key considerations for the use of synthetic data, examining their role in collaborative research for faster insights. ", doi="10.2196/30697", url="https://www.jmir.org/2021/10/e30697", url="http://www.ncbi.nlm.nih.gov/pubmed/34559671" } @Article{info:doi/10.2196/15739, author="Daniels, Helen and Jones, Helen Kerina and Heys, Sharon and Ford, Vincent David", title="Exploring the Use of Genomic and Routinely Collected Data: Narrative Literature Review and Interview Study", journal="J Med Internet Res", year="2021", month="Sep", day="24", volume="23", number="9", pages="e15739", keywords="genomic data", keywords="routine data", keywords="electronic health records", keywords="health data science", keywords="genome", keywords="data regulation", keywords="case study", keywords="eHealth", abstract="Background: Advancing the use of genomic data with routinely collected health data holds great promise for health care and research. Increasing the use of these data is a high priority to understand and address the causes of disease. Objective: This study aims to provide an outline of the use of genomic data alongside routinely collected data in health research to date. As this field prepares to move forward, it is important to take stock of the current state of play in order to highlight new avenues for development, identify challenges, and ensure that adequate data governance models are in place for safe and socially acceptable progress. Methods: We conducted a literature review to draw information from past studies that have used genomic and routinely collected data and conducted interviews with individuals who use these data for health research. We collected data on the following: the rationale of using genomic data in conjunction with routinely collected data, types of genomic and routinely collected data used, data sources, project approvals, governance and access models, and challenges encountered. Results: The main purpose of using genomic and routinely collected data was to conduct genome-wide and phenome-wide association studies. Routine data sources included electronic health records, disease and death registries, health insurance systems, and deprivation indices. The types of genomic data included polygenic risk scores, single nucleotide polymorphisms, and measures of genetic activity, and biobanks generally provided these data. Although the literature search showed that biobanks released data to researchers, the case studies revealed a growing tendency for use within a data safe haven. Challenges of working with these data revolved around data collection, data storage, technical, and data privacy issues. Conclusions: Using genomic and routinely collected data holds great promise for progressing health research. Several challenges are involved, particularly in terms of privacy. Overcoming these barriers will ensure that the use of these data to progress health research can be exploited to its full potential. ", doi="10.2196/15739", url="https://www.jmir.org/2021/9/e15739", url="http://www.ncbi.nlm.nih.gov/pubmed/34559060" } @Article{info:doi/10.2196/28635, author="Huemer, Matthias and Jahn-Kuch, Daniela and Hofmann, Guenter and Andritsch, Elisabeth and Farkas, Clemens and Schaupp, Walter and Masel, Katharina Eva and Jost, J. 
Philipp and Pichler, Martin", title="Trends and Patterns in the Public Awareness of Palliative Care, Euthanasia, and End-of-Life Decisions in 3 Central European Countries Using Big Data Analysis From Google: Retrospective Analysis", journal="J Med Internet Res", year="2021", month="Sep", day="20", volume="23", number="9", pages="e28635", keywords="Google Trends", keywords="end-of-life decisions", keywords="assisted suicide", keywords="euthanasia", keywords="palliative care", keywords="health care policy", abstract="Background: End-of-life decisions, specifically the provision of euthanasia and assisted suicide services, challenge traditional medical and ethical principles. Austria and Germany have decided to liberalize their laws restricting assisted suicide, thus reigniting the debate about a meaningful framework in which the practice should be embedded. Evidence of the relevance of assisted suicide and euthanasia for the general population in Germany and Austria is limited. Objective: The aim of this study is to examine whether the public awareness documented by search activities in the most frequently used search engine, Google, on the topics of palliative care, euthanasia, and advance health care directives changed with the implementation of palliative care services and new governmental regulations concerning end-of-life decisions. Methods: We searched for policies, laws, and regulations promulgated or amended in Austria, Germany, and Switzerland between 2004 and 2020 and extracted data on the search volume for each search term topic from Google Trends as a surrogate of public awareness and interest. Annual averages were analyzed using the Joinpoint Regression Program. Results: Important policy changes yielded significant changes in search trends for the investigated topics. The enactment of laws regulating advance health care directives coincided with a significant drop in the volume of searches for the topic of euthanasia in all 3 countries (Austria: --24.48\%, P=.02; Germany: --14.95\%, P<.001; Switzerland: --11.75\%, P=.049). Interest in palliative care increased with the availability of care services and the implementation of laws and policies to promote palliative care (Austria: 22.69\%, P=.01; Germany: 14.39\%, P<.001; Switzerland: 17.59\%, P<.001). The search trends for advance health care directives showed mixed results. While interest remained steady in Austria within the study period, it increased by 3.66\% (P<.001) in Switzerland and decreased by 2.85\% (P<.001) in Germany. Conclusions: Our results demonstrate that legal measures securing patients' autonomy at the end of life may lower the search activities for topics related to euthanasia and assisted suicide. Palliative care may be a meaningful way to raise awareness of the different options for end-of-life care and to guide patients in their decision-making process regarding the same.
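Joinpoint regression of the kind applied above finds the year at which an annual trend changes slope. A crude sketch that scans candidate breakpoints for the best two-segment least-squares fit, using invented annual averages (the Joinpoint Regression Program adds formal permutation testing, which this omits):

```python
import numpy as np

# Hypothetical annual average search-interest values (illustration only):
# flat from 2004-2012, then rising through 2020.
years = np.arange(2004, 2021)
interest = np.r_[np.linspace(40, 42, 9), np.linspace(42, 60, 8)]

def two_segment_sse(x, y, k):
    """Sum of squared errors of separate linear fits left/right of index k."""
    sse = 0.0
    for xs, ys in ((x[:k], y[:k]), (x[k:], y[k:])):
        coef = np.polyfit(xs, ys, 1)
        sse += float(np.sum((np.polyval(coef, xs) - ys) ** 2))
    return sse

best_k = min(range(2, len(years) - 2),
             key=lambda k: two_segment_sse(years, interest, k))
print("estimated joinpoint year:", years[best_k])  # 2013: the simulated change
```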
", doi="10.2196/28635", url="https://www.jmir.org/2021/9/e28635", url="http://www.ncbi.nlm.nih.gov/pubmed/34542419" } @Article{info:doi/10.2196/21810, author="Alaqra, Sarah Ala and Kane, Bridget and Fischer-H{\"u}bner, Simone", title="Machine Learning--Based Analysis of Encrypted Medical Data in the Cloud: Qualitative Study of Expert Stakeholders' Perspectives", journal="JMIR Hum Factors", year="2021", month="Sep", day="16", volume="8", number="3", pages="e21810", keywords="medical data analysis", keywords="encryption", keywords="privacy-enhancing technologies", keywords="machine learning", keywords="stakeholders", keywords="tradeoffs", keywords="perspectives", keywords="eHealth", keywords="interviews", abstract="Background: Third-party cloud-based data analysis applications are proliferating in electronic health (eHealth) because of the expertise offered and their monetary advantage. However, privacy and security are critical concerns when handling sensitive medical data in the cloud. Technical advances based on ``crypto magic'' in privacy-preserving machine learning (ML) enable data analysis in encrypted form for maintaining confidentiality. Such privacy-enhancing technologies (PETs) could be counterintuitive to relevant stakeholders in eHealth, which could in turn hinder adoption; thus, more attention is needed on human factors for establishing trust and transparency. Objective: The aim of this study was to analyze eHealth expert stakeholders' perspectives and the perceived tradeoffs in regard to data analysis on encrypted medical data in the cloud, and to derive user requirements for development of a privacy-preserving data analysis tool. Methods: We used semistructured interviews and report on 14 interviews with individuals having medical, technical, or research expertise in eHealth. We used thematic analysis for analyzing interview data. In addition, we conducted a workshop for eliciting requirements. Results: Our results show differences in the understanding of and in trusting the technology; caution is advised by technical experts, whereas patient safety assurances are required by medical experts. Themes were identified with general perspectives on data privacy and practices (eg, acceptance of using external services), as well as themes highlighting specific perspectives (eg, data protection drawbacks and concerns of the data analysis on encrypted data). The latter themes result in requiring assurances and conformance testing for trusting tools such as the proposed ML-based tool. Communicating privacy, and utility benefits and tradeoffs with stakeholders is essential for trust. Furthermore, stakeholders and their organizations share accountability of patient data. Finally, stakeholders stressed the importance of informing patients about the privacy of their data. Conclusions: Understanding the benefits and risks of using eHealth PETs is crucial, and collaboration among diverse stakeholders is essential. Assurances of the tool's privacy, accuracy, and patient safety should be in place for establishing trust of ML-based PETs, especially if used in the cloud. ", doi="10.2196/21810", url="https://humanfactors.jmir.org/2021/3/e21810", url="http://www.ncbi.nlm.nih.gov/pubmed/34528892" } @Article{info:doi/10.2196/29622, author="Lopez Segui, Francesc and Hernandez Guillamet, Guillem and Pifarr{\'e} Arolas, H{\'e}ctor and Marin-Gomez, X. 
Francesc and Ruiz Comellas, Anna and Ramirez Morros, Maria Anna and Adroher Mas, Cristina and Vidal-Alaball, Josep", title="Characterization and Identification of Variations in Types of Primary Care Visits Before and During the COVID-19 Pandemic in Catalonia: Big Data Analysis Study", journal="J Med Internet Res", year="2021", month="Sep", day="14", volume="23", number="9", pages="e29622", keywords="COVID-19", keywords="primary care", keywords="diagnose variations", keywords="big data", keywords="ICD10", keywords="health system", keywords="healthcare system", abstract="Background: The COVID-19 pandemic has turned the care model of health systems around the world upside down, causing the abrupt cancellation of face-to-face visits and redirection of the model toward telemedicine. Digital transformation boosts information systems---the more robust they are, the easier it is to monitor the health care system in a highly complex state and allow for more agile and reliable analysis. Objective: The purpose of this study was to analyze diagnoses from primary care visits and distinguish between those that had higher and lower variations, relative to the 2019 and 2020 periods (roughly before and during COVID-19), to identify clinical profiles that may have been most impaired from the least-used diagnostic codes for visits during the pandemic. Methods: We used a database from the Primary Care Services Information Technologies Information System of Catalonia. We analyzed the register of visits (n=2,824,185) and their International Classification of Diseases (ICD-10) diagnostic codes (n=3,921,974; mean 1.38 per visit), as approximations of the reasons for consultations, at 3 different grouping levels. The data were represented by a term frequency matrix and analyzed recursively in different partitions aggregated according to date. Results: The increase in non--face-to-face visits (+267\%) did not counterbalance the decrease in face-to-face visits (--47\%), with an overall reduction in the total number of visits of 1.36\%, despite the notable increase in nursing visits (10.54\%). The largest increases in 2020 were visits with diagnoses related to COVID-19 (ICD-10 codes Z20-Z29: 2540\%), along with codes related to economic and housing problems (ICD-10 codes Z55-Z65: 44.40\%). Visits with most of the other diagnostic codes decreased in 2020 relative to those in 2019. The largest reductions were in chronic pathologies such as arterial hypertension (ICD-10 codes I10-I16: --32.73\%) or diabetes (ICD-10 codes E08-E13: --21.13\%), but also obesity (E65-E68: --48.58\%) and bodily injuries (ICD-10 code T14: --33.70\%). Visits with mental health--related diagnostic codes decreased, but the decrease was less than the average decrease. There was a decrease in consultations---for children, adolescents, and adults---for respiratory infections (ICD-10 codes J00-J06: --40.96\%). The results show large year-on-year variations (in absolute terms, an average of 12\%), which is representative of the strong shock to the health system. Conclusions: The disruption in the primary care model in Catalonia has led to an explosive increase in the number of non--face-to-face visits. There has been a reduction in the number of visits for diagnoses related to chronic pathologies, respiratory infections, obesity, and bodily injuries. Instead, visits for diagnoses related to socioeconomic and housing problems have increased, which emphasizes the importance of social determinants of health in the context of this pandemic. 
Big data analytics with routine care data yield findings that are consistent with those derived from intuition in everyday clinical practice and can help health planners make decisions about using the next few years to focus on the diseases that were least treated during the COVID-19 pandemic. ", doi="10.2196/29622", url="https://www.jmir.org/2021/9/e29622", url="http://www.ncbi.nlm.nih.gov/pubmed/34313600" } @Article{info:doi/10.2196/29310, author="Patterson, Rees Jenny and Shaw, Donna and Thomas, R. Sharita and Hayes, A. Julie and Daley, R. Christopher and Knight, Stefania and Aikat, Jay and Mieczkowska, O. Joanna and Ahalt, C. Stanley and Krishnamurthy, K. Ashok", title="COVID-19 Data Utilization in North Carolina: Qualitative Analysis of Stakeholder Experiences", journal="JMIR Public Health Surveill", year="2021", month="Sep", day="2", volume="7", number="9", pages="e29310", keywords="qualitative research", keywords="interview", keywords="COVID-19", keywords="SARS-CoV-2", keywords="pandemic", keywords="data collection", keywords="data reporting", keywords="data", keywords="public health", keywords="coronavirus disease 2019", abstract="Background: As the world faced the pandemic caused by the novel coronavirus disease 2019 (COVID-19), medical professionals, technologists, community leaders, and policy makers sought to understand how best to leverage data for public health surveillance and community education. With this complex public health problem, North Carolinians relied on data from state, federal, and global health organizations to increase their understanding of the pandemic and guide decision-making. Objective: We aimed to describe the role that stakeholders involved in COVID-19--related data played in managing the pandemic in North Carolina. The study investigated the processes used by organizations throughout the state in using, collecting, and reporting COVID-19 data. Methods: We used an exploratory qualitative study design to investigate North Carolina's COVID-19 data collection efforts. To better understand these processes, key informant interviews were conducted with employees from organizations that collected COVID-19 data across the state. We developed an interview guide, and open-ended semistructured interviews were conducted during the period from June through November 2020. Interviews lasted between 30 and 45 minutes and were conducted by data scientists via videoconference. Data were subsequently analyzed using qualitative data analysis software. Results: Results indicated that electronic health records were primary sources of COVID-19 data. Often, data were also used to create dashboards to inform the public or other health professionals, to aid in decision-making, or for reporting purposes. Cross-sector collaboration was cited as a major success. Consistency among metrics and data definitions, data collection processes, and contact tracing were cited as challenges. Conclusions: Findings suggest that, during future outbreaks, organizations across regions could benefit from data centralization and data governance. Data should be publicly accessible and in a user-friendly format. Additionally, established cross-sector collaboration networks are demonstrably beneficial for public health professionals across the state as these established relationships facilitate a rapid response to evolving public health challenges. 
", doi="10.2196/29310", url="https://publichealth.jmir.org/2021/9/e29310", url="http://www.ncbi.nlm.nih.gov/pubmed/34298500" } @Article{info:doi/10.2196/24079, author="Ghanad Poor, Niema and West, C. Nicholas and Sreepada, Syamala Rama and Murthy, Srinivas and G{\"o}rges, Matthias", title="An Artificial Neural Network--Based Pediatric Mortality Risk Score: Development and Performance Evaluation Using Data From a Large North American Registry", journal="JMIR Med Inform", year="2021", month="Aug", day="31", volume="9", number="8", pages="e24079", keywords="artificial intelligence", keywords="risk assessment", keywords="decision support techniques", keywords="intensive care unit", keywords="pediatric", keywords="decision making", keywords="computer-assisted", abstract="Background: In the pediatric intensive care unit (PICU), quantifying illness severity can be guided by risk models to enable timely identification and appropriate intervention. Logistic regression models, including the pediatric index of mortality 2 (PIM-2) and pediatric risk of mortality III (PRISM-III), produce a mortality risk score using data that are routinely available at PICU admission. Artificial neural networks (ANNs) outperform regression models in some medical fields. Objective: In light of this potential, we aim to examine ANN performance, compared to that of logistic regression, for mortality risk estimation in the PICU. Methods: The analyzed data set included patients from North American PICUs whose discharge diagnostic codes indicated evidence of infection and included the data used for the PIM-2 and PRISM-III calculations and their corresponding scores. We stratified the data set into training and test sets, with approximately equal mortality rates, in an effort to replicate real-world data. Data preprocessing included imputing missing data through simple substitution and normalizing data into binary variables using PRISM-III thresholds. A 2-layer ANN model was built to predict pediatric mortality, along with a simple logistic regression model for comparison. Both models used the same features required by PIM-2 and PRISM-III. Alternative ANN models using single-layer or unnormalized data were also evaluated. Model performance was compared using the area under the receiver operating characteristic curve (AUROC) and the area under the precision recall curve (AUPRC) and their empirical 95\% CIs. Results: Data from 102,945 patients (including 4068 deaths) were included in the analysis. The highest performing ANN (AUROC 0.871, 95\% CI 0.862-0.880; AUPRC 0.372, 95\% CI 0.345-0.396) that used normalized data performed better than PIM-2 (AUROC 0.805, 95\% CI 0.801-0.816; AUPRC 0.234, 95\% CI 0.213-0.255) and PRISM-III (AUROC 0.844, 95\% CI 0.841-0.855; AUPRC 0.348, 95\% CI 0.322-0.367). The performance of this ANN was also significantly better than that of the logistic regression model (AUROC 0.862, 95\% CI 0.852-0.872; AUPRC 0.329, 95\% CI 0.304-0.351). The performance of the ANN that used unnormalized data (AUROC 0.865, 95\% CI 0.856-0.874) was slightly inferior to our highest performing ANN; the single-layer ANN architecture performed poorly and was not investigated further. Conclusions: A simple ANN model performed slightly better than the benchmark PIM-2 and PRISM-III scores and a traditional logistic regression model trained on the same data set. 
The small performance gains achieved by this 2-layer ANN model may not offer clinically significant improvement; however, further research with other or more sophisticated model designs and better imputation of missing data may be warranted. ", doi="10.2196/24079", url="https://medinform.jmir.org/2021/8/e24079", url="http://www.ncbi.nlm.nih.gov/pubmed/34463636" } @Article{info:doi/10.2196/27681, author="Zhu, Peng Yu and Park, Woo Han", title="Development of a COVID-19 Web Information Transmission Structure Based on a Quadruple Helix Model: Webometric Network Approach Using Bing", journal="J Med Internet Res", year="2021", month="Aug", day="26", volume="23", number="8", pages="e27681", keywords="quadruple helix model", keywords="COVID-19", keywords="structural analysis", keywords="content analysis", keywords="network analysis", keywords="public health", keywords="webometrics", keywords="infodemiology", keywords="infoveillance", keywords="development", keywords="internet", keywords="online health information", keywords="structure", keywords="communication", keywords="big data", abstract="Background: Developing an understanding of the social structure and phenomenon of pandemic information sources worldwide is immensely significant. Objective: Based on the quadruple helix model, the aim of this study was to construct and analyze the structure and content of the internet information sources regarding the COVID-19 pandemic, considering time and space. The broader goal was to determine the status and limitations of web information transmission and online communication structure during public health emergencies. Methods: By sorting the second top-level domain, we divided the structure of network information sources into four levels: government, educational organizations, companies, and nonprofit organizations. We analyzed the structure of information sources and the evolution of information content at each stage using quadruple helix and network analysis methods. Results: The results of the structural analysis indicated that the online sources of information in Asia were more diverse than those in other regions in February 2020. As the pandemic spread in April, the information sources in non-Asian regions began to diversify, and the information source structure diversified further in July. With the spread of the pandemic, for an increasing number of countries, not only the government authorities of high concern but also commercial and educational organizations began to produce and provide significant amounts of information and advice. Nonprofit organizations also produced information, but to a lesser extent. The impact of the virus spread from the initial public level of the government to many levels within society. After April, the government's role in the COVID-19 network information was central. The results of the content analysis showed that there was an increased focus on discussion regarding public health--related campaign materials at all stages. The information content changed with the changing stages. In the early stages, the basic situation regarding the virus and its impact on health attracted most of the attention. Later, the content was more focused on prevention. The business and policy environment also changed from the beginning of the pandemic, and the social changes caused by the pandemic became a popular discussion topic. Conclusions: For public health emergencies, some online and offline information sources may not be sufficient. 
Diversified institutions must pay attention to public health emergencies and actively respond to multihelical information sources. In terms of published messages, the educational sector plays an important role in public health events. However, educational institutions release less information than governments and businesses. This study proposes that the quadruple helix not only has research significance in the field of scientific cooperation but could also be used to perform effective research regarding web information during crises. This is significant for further development of the quadruple helix model in the medical internet research area. ", doi="10.2196/27681", url="https://www.jmir.org/2021/8/e27681", url="http://www.ncbi.nlm.nih.gov/pubmed/34280119" } @Article{info:doi/10.2196/24762, author="Yang, Hyun-Lim and Jung, Chul-Woo and Yang, Mi Seong and Kim, Min-Soo and Shim, Sungho and Lee, Hyun Kook and Lee, Hyung-Chul", title="Development and Validation of an Arterial Pressure-Based Cardiac Output Algorithm Using a Convolutional Neural Network: Retrospective Study Based on Prospective Registry Data", journal="JMIR Med Inform", year="2021", month="Aug", day="16", volume="9", number="8", pages="e24762", keywords="cardiac output", keywords="deep learning", keywords="arterial pressure", abstract="Background: Arterial pressure-based cardiac output (APCO) is a less invasive method for estimating cardiac output without concerns about complications from the pulmonary artery catheter (PAC). However, inaccuracies of currently available APCO devices have been reported. Improvements to the algorithm by researchers are impossible, as only a subset of the algorithm has been released. Objective: In this study, an open-source algorithm was developed and validated using a convolutional neural network and a transfer learning technique. Methods: A retrospective study was performed using data from a prospective cohort registry of intraoperative bio-signal data from a university hospital. The convolutional neural network model was trained using the arterial pressure waveform as input and the stroke volume (SV) value as the output. The model parameters were pretrained using the SV values from a commercial APCO device (Vigileo or EV1000 with the FloTrac algorithm) and adjusted with a transfer learning technique using SV values from the PAC. The performance of the model was evaluated using absolute error for the PAC on the testing dataset from separate periods. Finally, we compared the performance of the deep learning model and the FloTrac with the SV values from the PAC. Results: A total of 2057 surgical cases (1958 training and 99 testing cases) were used in the registry. In the deep learning model, the absolute errors of SV were 14.5 (SD 13.4) mL (10.2 [SD 8.4] mL in cardiac surgery and 17.4 [SD 15.3] mL in liver transplantation). The absolute errors of the deep learning model were significantly smaller than those of the FloTrac (16.5 [SD 15.4] mL and 18.3 [SD 15.1] mL; P<.001). Conclusions: The deep learning--based APCO algorithm showed better performance than the commercial APCO device. Further improvement of the algorithm developed in this study may be helpful for estimating cardiac output accurately in clinical practice and optimizing high-risk patient care. ", doi="10.2196/24762", url="https://medinform.jmir.org/2021/8/e24762", url="http://www.ncbi.nlm.nih.gov/pubmed/34398790" } @Article{info:doi/10.2196/27017, author="Bright, A. Roselie and Rankin, K. Summer and Dowdy, Katherine and Blok, V.
Sergey and Bright, J. Susan and Palmer, M. Lee Anne", title="Finding Potential Adverse Events in the Unstructured Text of Electronic Health Care Records: Development of the Shakespeare Method", journal="JMIRx Med", year="2021", month="Aug", day="11", volume="2", number="3", pages="e27017", keywords="epidemiology", keywords="electronic health record", keywords="electronic health care record", keywords="big data", keywords="patient harm", keywords="patient safety", keywords="public health", keywords="product surveillance, postmarketing", keywords="natural language processing", keywords="proof-of-concept study", keywords="critical care", abstract="Background: Big data tools provide opportunities to monitor adverse events (AEs; patient harm associated with medical care) in the unstructured text of electronic health care records (EHRs). Writers may explicitly state an apparent association between treatment and adverse outcome (``attributed'') or simply state the treatment and outcome without an association (``unattributed''). Many methods for finding AEs in text rely on predefining possible AEs before searching for prespecified words and phrases or manual labeling (standardization) by investigators. We developed a method to identify possible AEs, even if unknown or unattributed, without any prespecifications or standardization of notes. Our method was inspired by word-frequency analysis methods used to uncover the true authorship of disputed works credited to William Shakespeare. We chose two use cases, ``transfusion'' and ``time-based.'' Transfusion was chosen because new transfusion AE types were becoming recognized during the study data period; therefore, we anticipated an opportunity to find unattributed potential AEs (PAEs) in the notes. With the time-based case, we wanted to simulate near real-time surveillance. We chose time periods in the hope of detecting PAEs due to contaminated heparin from mid-2007 to mid-2008 that were announced in early 2008. We hypothesized that the prevalence of contaminated heparin may have been widespread enough to manifest in EHRs through symptoms related to heparin AEs, independent of clinicians' documentation of attributed AEs. Objective: We aimed to develop a new method to identify attributed and unattributed PAEs using the unstructured text of EHRs. Methods: We used EHRs for adult critical care admissions at a major teaching hospital (2001-2012). For each case, we formed a group of interest and a comparison group. We concatenated the text notes for each admission into one document sorted by date, and deleted replicate sentences and lists. We identified statistically significant words in the group of interest versus the comparison group. Documents in the group of interest were filtered to those words, followed by topic modeling on the filtered documents to produce topics. For each topic, the three documents with the maximum topic scores were manually reviewed to identify PAEs. Results: Topics centered around medical conditions that were unique to or more common in the group of interest, including PAEs. In each use case, most PAEs were unattributed in the notes. Among the transfusion PAEs was unattributed evidence of transfusion-associated cardiac overload and transfusion-related acute lung injury. Some of the PAEs from mid-2007 to mid-2008 were increased unattributed events consistent with AEs related to heparin contamination. Conclusions: The Shakespeare method could be a useful supplement to AE reporting and surveillance of structured EHR data. 
Future improvements should include automation of the manual review process. ", doi="10.2196/27017", url="https://med.jmirx.org/2021/3/e27017", url="http://www.ncbi.nlm.nih.gov/pubmed/37725533" } @Article{info:doi/10.2196/27116, author="Tran, Thanh Huyen Thi and Lu, Shih-Hao and Tran, Thu Ha Thi and Nguyen, Van Bien", title="Social Media Insights During the COVID-19 Pandemic: Infodemiology Study Using Big Data", journal="JMIR Med Inform", year="2021", month="Jul", day="16", volume="9", number="7", pages="e27116", keywords="COVID-19", keywords="Vietnam", keywords="public attention", keywords="social media", keywords="infodemic", keywords="issue-attention cycle", keywords="media framing", keywords="big data", keywords="health crisis management", keywords="insight", keywords="infodemiology", keywords="infoveillance", keywords="social listening", abstract="Background: The COVID-19 pandemic is still undergoing complicated developments in Vietnam and around the world. There is a lot of information about the COVID-19 pandemic, especially on the internet where people can create and share information quickly. This can lead to an infodemic, which is a challenge every government might face in the fight against pandemics. Objective: This study aims to understand public attention toward the pandemic (from December 2019 to November 2020) through 7 types of sources: Facebook, Instagram, YouTube, blogs, news sites, forums, and e-commerce sites. Methods: We collected and analyzed nearly 38 million pieces of text data from the aforementioned sources via SocialHeat, a social listening (infoveillance) platform developed by YouNet Group. We described not only public attention volume trends, discussion sentiments, top sources, top posts that gained the most public attention, and hot keyword frequency but also hot keywords' co-occurrence as visualized by the VOSviewer software tool. Results: In this study, we reached four main conclusions. First, from changing discussion trends regarding COVID-19, 7 periods were identified based on events that can be aggregated into two pandemic waves in Vietnam. Second, community pages on Facebook were the source of the most engagement from the public. However, the sources with the highest average interaction efficiency per article were government sources. Third, people's attitudes when discussing the pandemic have changed from negative to positive emotions. Fourth, the type of content that attracts the most interactions from people varies from time to time. Besides that, the issue-attention cycle occurred not only once but four times during the COVID-19 pandemic in Vietnam. Conclusions: Our study shows that online resources can help the government quickly identify public attention to public health messages during times of crisis. We also determined the hot spots that most interested the public and public attention communication patterns, which can help the government obtain practical information to craft more effective policy responses and help prevent the spread of the pandemic. 
", doi="10.2196/27116", url="https://medinform.jmir.org/2021/7/e27116", url="http://www.ncbi.nlm.nih.gov/pubmed/34152994" } @Article{info:doi/10.2196/16750, author="Ahuja, Manik and Aseltine Jr, Robert", title="Barriers to Dissemination of Local Health Data Faced by US State Agencies: Survey Study of Behavioral Risk Factor Surveillance System Coordinators", journal="J Med Internet Res", year="2021", month="Jul", day="13", volume="23", number="7", pages="e16750", keywords="web-based data query systems, WDQS", keywords="health data", keywords="population health", keywords="dissemination of local health data", abstract="Background: Advances in information technology have paved the way to facilitate accessibility to population-level health data through web-based data query systems (WDQSs). Despite these advances in technology, US state agencies face many challenges related to the dissemination of their local health data. It is essential for the public to have access to high-quality data that are easy to interpret, reliable, and trusted. These challenges have been at the forefront throughout the COVID-19 pandemic. Objective: The purpose of this study is to identify the most significant challenges faced by state agencies, from the perspective of the Behavioral Risk Factor Surveillance System (BRFSS) coordinator from each state, and to assess if the coordinators from states with a WDQS perceive these challenges differently. Methods: We surveyed BRFSS coordinators (N=43) across all 50 US states and the District of Columbia. We surveyed the participants about contextual factors and asked them to rate system aspects and challenges they faced with their health data system on a Likert scale. We used two-sample t tests to compare the means of the ratings by participants from states with and without a WDQS. Results: Overall, 41/43 states (95\%) make health data available over the internet, while 65\% (28/43) employ a WDQS. States with a WDQS reported greater challenges (P=.01) related to the cost of hardware and software (mean score 3.44/4, 95\% CI 3.09-3.78) than states without a WDQS (mean score 2.63/4, 95\% CI 2.25-3.00). The system aspect of standardization of vocabulary scored more favorably (P=.01) in states with a WDQS (mean score 3.32/5, 95\% CI 2.94-3.69) than in states without a WDQS (mean score 2.85/5, 95\% CI 2.47-3.22). Conclusions: Securing of adequate resources and commitment to standardization are vital in the dissemination of local-level health data. Factors such as receiving data in a timely manner, privacy, and political opposition are less significant barriers than anticipated. 
", doi="10.2196/16750", url="https://www.jmir.org/2021/7/e16750", url="http://www.ncbi.nlm.nih.gov/pubmed/34255650" } @Article{info:doi/10.2196/26290, author="Filos, Dimitris and Lekka, Irini and Kilintzis, Vasileios and Stefanopoulos, Leandros and Karavidopoulou, Youla and Maramis, Christos and Diou, Christos and Sarafis, Ioannis and Papapanagiotou, Vasileios and Alagialoglou, Leonidas and Ioakeimidis, Ioannis and Hassapidou, Maria and Charmandari, Evangelia and Heimeier, Rachel and O'Malley, Grace and O'Donnell, Shane and Doyle, Gerardine and Delopoulos, Anastasios and Maglaveras, Nicos", title="Exploring Associations Between Children's Obesogenic Behaviors and the Local Environment Using Big Data: Development and Evaluation of the Obesity Prevention Dashboard", journal="JMIR Mhealth Uhealth", year="2021", month="Jul", day="9", volume="9", number="7", pages="e26290", keywords="public health authorities", keywords="childhood obesity", keywords="children's behavior", keywords="environment", keywords="COVID-19", keywords="big data", keywords="mHealth", keywords="uHealth", keywords="intervention", abstract="Background: Obesity is a major public health problem globally and in Europe. The prevalence of childhood obesity is also soaring. Several parameters of the living environment are contributing to this increase, such as the density of fast food retailers, and thus, preventive health policies against childhood obesity must focus on the environment to which children are exposed. Currently, there are no systems in place to objectively measure the effect of living environment parameters on obesogenic behaviors and obesity. The H2020 project ``BigO: Big Data Against Childhood Obesity'' aims to tackle childhood obesity by creating new sources of evidence based on big data. Objective: This paper introduces the Obesity Prevention dashboard (OPdashboard), implemented in the context of BigO, which offers an interactive data platform for the exploration of objective obesity-related behaviors and local environments based on the data recorded using the BigO mHealth (mobile health) app. Methods: The OPdashboard, which can be accessed on the web, allows for (1) the real-time monitoring of children's obesogenic behaviors in a city area, (2) the extraction of associations between these behaviors and the local environment, and (3) the evaluation of interventions over time. More than 3700 children from 33 schools and 2 clinics in 5 European cities have been monitored using a custom-made mobile app created to extract behavioral patterns by capturing accelerometer and geolocation data. Online databases were assessed in order to obtain a description of the environment. The dashboard's functionality was evaluated during a focus group discussion with public health experts. Results: The preliminary association outcomes in 2 European cities, namely Thessaloniki, Greece, and Stockholm, Sweden, indicated a correlation between children's eating and physical activity behaviors and the availability of food-related places or sports facilities close to schools. In addition, the OPdashboard was used to assess changes to children's physical activity levels as a result of the health policies implemented to decelerate the COVID-19 outbreak. The preliminary outcomes of the analysis revealed that in urban areas the decrease in physical activity was statistically significant, while a slight increase was observed in the suburbs. 
These findings indicate the importance of the availability of open spaces for behavioral change in children. Discussions with public health experts outlined the dashboard's potential to aid in a better understanding of the interplay between children's obesogenic behaviors and the environment, and improvements were suggested. Conclusions: Our analyses serve as an initial investigation using the OPdashboard. Additional factors must be incorporated in order to optimize its use and obtain a clearer understanding of the results. The unique big data that are available through the OPdashboard can lead to the implementation of models that are able to predict population behavior. The OPdashboard can be considered as a tool that will increase our understanding of the underlying factors in childhood obesity and inform the design of regional interventions both for prevention and treatment. ", doi="10.2196/26290", url="https://mhealth.jmir.org/2021/7/e26290", url="http://www.ncbi.nlm.nih.gov/pubmed/34048353" } @Article{info:doi/10.2196/29614, author="Viberg Johansson, Jennifer and Bentzen, Beate Heidi and Shah, Nisha and Haraldsd{\'o}ttir, Eik and J{\'o}nsd{\'o}ttir, Andrea Gu{\dh}bj{\"o}rg and Kaye, Jane and Mascalzoni, Deborah and Veldwijk, Jorien", title="Preferences of the Public for Sharing Health Data: Discrete Choice Experiment", journal="JMIR Med Inform", year="2021", month="Jul", day="5", volume="9", number="7", pages="e29614", keywords="preferences", keywords="discrete choice experiment", keywords="health data", keywords="secondary use", keywords="willingness to share", abstract="Background: Digital technological development in the last 20 years has led to significant growth in digital collection, use, and sharing of health data. To maintain public trust in the digital society and to enable acceptable policy-making in the future, it is important to investigate people's preferences for sharing digital health data. Objective: The aim of this study is to elicit the preferences of the public in different Northern European countries (the United Kingdom, Norway, Iceland, and Sweden) for sharing health information in different contexts. Methods: Respondents in this discrete choice experiment completed several choice tasks, in which they were asked if data sharing in the described hypothetical situation was acceptable to them. Latent class logistic regression models were used to determine attribute-level estimates and heterogeneity in preferences. We calculated the relative importance of the attributes and the predicted acceptability for different contexts in which the data were shared from the estimates. Results: In the final analysis, we used 37.83\% (1967/5199) of the questionnaires. All attributes influenced the respondents' willingness to share health information (P<.001). The most important attribute was whether the respondents were informed about their data being shared. The possibility of opting out from sharing data was preferred over the opportunity to consent (opt-in). Four classes were identified in the latent class model, and the average probabilities of belonging were 27\% for class 1, 32\% for class 2, 23\% for class 3, and 18\% for class 4. The uptake probability varied between 14\% and 85\%, depending on the least to most preferred combination of levels. Conclusions: Respondents from different countries have different preferences for sharing their health data regarding the value of a review process and the reason for the data's new use. 
Offering respondents information about the use of their data and the possibility to opt out is the most preferred governance mechanism. ", doi="10.2196/29614", url="https://medinform.jmir.org/2021/7/e29614", url="http://www.ncbi.nlm.nih.gov/pubmed/36260402" } @Article{info:doi/10.2196/25266, author="Schmit, Cason and Giannouchos, Theodoros and Ramezani, Mahin and Zheng, Qi and Morrisey, A. Michael and Kum, Hye-Chung", title="US Privacy Laws Go Against Public Preferences and Impede Public Health and Research: Survey Study", journal="J Med Internet Res", year="2021", month="Jul", day="5", volume="23", number="7", pages="e25266", keywords="privacy", keywords="law", keywords="medical informatics", keywords="conjoint analysis", keywords="surveys and questionnaires", keywords="public health", keywords="information dissemination", keywords="health policy", keywords="public policy", keywords="big data", abstract="Background: Reaping the benefits from massive volumes of data collected in all sectors to improve population health, inform personalized medicine, and transform biomedical research requires a delicate balance between the benefits and risks of using individual-level data. There is a patchwork of US data protection laws that vary depending on the type of data, who is using it, and their intended purpose. Differences in these laws challenge big data projects using data from different sources. The decisions to permit or restrict data uses are determined by elected officials; therefore, constituent input is critical to finding the right balance between individual privacy and public benefits. Objective: This study explores the US public's preferences for using identifiable data for different purposes without their consent. Methods: We measured data use preferences of a nationally representative sample of 504 US adults by conducting a web-based survey in February 2020. The survey used a choice-based conjoint analysis. We selected choice-based conjoint attributes and levels based on 5 US data protection laws (Health Insurance Portability and Accountability Act, Family Educational Rights and Privacy Act, Privacy Act of 1974, Federal Trade Commission Act, and the Common Rule). There were 72 different combinations of attribute levels, representing different data use scenarios. Participants were given 12 pairs of data use scenarios and were asked to choose the scenario they were the most comfortable with. We then simulated the population preferences with a hierarchical Bayes regression model using the ChoiceModelR package in R. Results: Participants strongly preferred data reuse for public health and research rather than for profit-driven, marketing, or crime-detection activities. Participants also strongly preferred data use by universities or nonprofit organizations over data use by businesses and governments. Participants were fairly indifferent about the different types of data used (health, education, government, or economic data). Conclusions: Our results show a notable incongruence between public preferences and current US data protection laws. Our findings appear to show that the US public favors data uses promoting social benefits over those promoting individual or organizational interests. This study provides strong support for continued efforts to provide safe access to useful data sets for research and public health. Policy makers should consider more robust public health and research data use exceptions to align laws with public preferences. 
In addition, policy makers who revise laws to enable data use for research and public health should consider more comprehensive protection mechanisms, including transparent use of data and accountability. ", doi="10.2196/25266", url="https://www.jmir.org/2021/7/e25266", url="http://www.ncbi.nlm.nih.gov/pubmed/36260399" } @Article{info:doi/10.2196/25482, author="Feusner, D. Jamie and Mohideen, Reza and Smith, Stephen and Patanam, Ilyas and Vaitla, Anil and Lam, Christopher and Massi, Michelle and Leow, Alex", title="Semantic Linkages of Obsessions From an International Obsessive-Compulsive Disorder Mobile App Data Set: Big Data Analytics Study", journal="J Med Internet Res", year="2021", month="Jun", day="21", volume="23", number="6", pages="e25482", keywords="OCD", keywords="natural language processing", keywords="clinical subtypes", keywords="semantic", keywords="word embedding", keywords="clustering", abstract="Background: Obsessive-compulsive disorder (OCD) is characterized by recurrent intrusive thoughts, urges, or images (obsessions) and repetitive physical or mental behaviors (compulsions). Previous factor analytic and clustering studies suggest the presence of three or four subtypes of OCD symptoms. However, these studies have relied on predefined symptom checklists, which are limited in breadth and may be biased toward researchers' previous conceptualizations of OCD. Objective: In this study, we examine a large data set of freely reported obsession symptoms obtained from an OCD mobile app as an alternative approach to uncovering potential OCD subtypes. From this, we examine data-driven clusters of obsessions based on their latent semantic relationships in the English language using word embeddings. Methods: We extracted free-text entry words describing obsessions in a large sample of users of a mobile app, NOCD. Semantic vector space modeling was applied using the Global Vectors for Word Representation algorithm. A domain-specific extension, Mittens, was also applied to enhance the corpus with OCD-specific words. The resulting representations provided linear substructures of the word vector in a 100-dimensional space. We applied principal component analysis to the 100-dimensional vector representation of the most frequent words, followed by k-means clustering to obtain clusters of related words. Results: We obtained 7001 unique words representing obsessions from 25,369 individuals. Heuristics for determining the optimal number of clusters pointed to a three-cluster solution for grouping subtypes of OCD. The first had themes relating to relationship and just-right; the second had themes relating to doubt and checking; and the third had themes relating to contamination, somatic, physical harm, and sexual harm. All three clusters showed close semantic relationships with each other in the central area of convergence, with themes relating to harm. An equal-sized split-sample analysis across individuals and a split-sample analysis over time both showed overall stable cluster solutions. Words in the third cluster were the most frequently occurring words, followed by words in the first cluster. Conclusions: The clustering of naturally acquired obsessional words resulted in three major groupings of semantic themes, which partially overlapped with predefined checklists from previous studies. 
Furthermore, the closeness of the overall embedded relationships across clusters and their central convergence on harm suggest that, at least at the level of self-reported obsessional thoughts, most obsessions have close semantic relationships. Harm to self or others may be an underlying organizing theme across many obsessions. Notably, relationship-themed words, not previously included in factor-analytic studies, clustered with just-right words. These novel insights have potential implications for understanding how an apparent multitude of obsessional symptoms are connected by underlying themes. This observation could aid exposure-based treatment approaches and could be used as a conceptual framework for future research. ", doi="10.2196/25482", url="https://www.jmir.org/2021/6/e25482", url="http://www.ncbi.nlm.nih.gov/pubmed/33892466" } @Article{info:doi/10.2196/27976, author="Miller, Michele and Romine, William and Oroszi, Terry", title="Public Discussion of Anthrax on Twitter: Using Machine Learning to Identify Relevant Topics and Events", journal="JMIR Public Health Surveill", year="2021", month="Jun", day="18", volume="7", number="6", pages="e27976", keywords="anthrax", keywords="big data", keywords="internet", keywords="infodemiology", keywords="infoveillance", keywords="social listening", keywords="digital health", keywords="biological weapon", keywords="terrorism", keywords="Federal Bureau of Investigation", keywords="machine learning", keywords="public health threat", keywords="Twitter", abstract="Background: Social media allows researchers to study opinions and reactions to events in real time. One area needing more study is anthrax-related events. A computational framework that utilizes machine learning techniques was created to collect tweets discussing anthrax, further categorize them as relevant by the month of data collection, and detect discussions on anthrax-related events. Objective: The objective of this study was to detect discussions on anthrax-related events and to determine the relevance of the tweets and topics of discussion over 12 months of data collection. Methods: This is an infoveillance study, using tweets in English containing the keywords ``Anthrax'' and ``Bacillus anthracis'', collected from September 25, 2017, through August 15, 2018. Machine learning techniques were used to determine what people were tweeting about anthrax. Data over time were plotted to determine whether an event was detected (a 3-fold spike in tweets). A machine learning classifier was created to categorize tweets by relevance to anthrax. Relevant tweets by month were examined using a topic modeling approach to determine the topics of discussion over time and how these events influenced that discussion. Results: Over the 12 months of data collection, a total of 204,008 tweets were collected. Logistic regression analysis revealed the best performance for relevance (precision=0.81; recall=0.81; F1-score=0.80). In total, 26 topics were associated with anthrax-related events, tweets that were highly retweeted, natural outbreaks, and news stories. Conclusions: This study shows that tweets related to anthrax can be collected and analyzed over time to determine what people are discussing and to detect key anthrax-related events. Future studies could focus only on opinion tweets, use the methodology to study other terrorism events, or monitor for terrorism threats. 
", doi="10.2196/27976", url="https://publichealth.jmir.org/2021/6/e27976", url="http://www.ncbi.nlm.nih.gov/pubmed/34142975" } @Article{info:doi/10.2196/26681, author="Blitz, Rog{\'e}rio and Storck, Michael and Baune, T. Bernhard and Dugas, Martin and Opel, Nils", title="Design and Implementation of an Informatics Infrastructure for Standardized Data Acquisition, Transfer, Storage, and Export in Psychiatric Clinical Routine: Feasibility Study", journal="JMIR Ment Health", year="2021", month="Jun", day="9", volume="8", number="6", pages="e26681", keywords="medical informatics", keywords="digital mental health", keywords="digital data collection", keywords="psychiatry", keywords="single-source metadata architecture transformation", keywords="mental health", keywords="design", keywords="implementation", keywords="feasibility", keywords="informatics", keywords="infrastructure", keywords="data", abstract="Background: Empirically driven personalized diagnostic applications and treatment stratification is widely perceived as a major hallmark in psychiatry. However, databased personalized decision making requires standardized data acquisition and data access, which are currently absent in psychiatric clinical routine. Objective: Here, we describe the informatics infrastructure implemented at the psychiatric M{\"u}nster University Hospital, which allows standardized acquisition, transfer, storage, and export of clinical data for future real-time predictive modelling in psychiatric routine. Methods: We designed and implemented a technical architecture that includes an extension of the electronic health record (EHR) via scalable standardized data collection and data transfer between EHRs and research databases, thus allowing the pooling of EHRs and research data in a unified database and technical solutions for the visual presentation of collected data and analyses results in the EHR. The Single-source Metadata ARchitecture Transformation (SMA:T) was used as the software architecture. SMA:T is an extension of the EHR system and uses module-driven engineering to generate standardized applications and interfaces. The operational data model was used as the standard. Standardized data were entered on iPads via the Mobile Patient Survey (MoPat) and the web application Mopat@home, and the standardized transmission, processing, display, and export of data were realized via SMA:T. Results: The technical feasibility of the informatics infrastructure was demonstrated in the course of this study. We created 19 standardized documentation forms with 241 items. For 317 patients, 6451 instances were automatically transferred to the EHR system without errors. Moreover, 96,323 instances were automatically transferred from the EHR system to the research database for further analyses. Conclusions: In this study, we present the successful implementation of the informatics infrastructure enabling standardized data acquisition and data access for future real-time predictive modelling in clinical routine in psychiatry. The technical solution presented here might guide similar initiatives at other sites and thus help to pave the way toward future application of predictive models in psychiatric clinical routine. ", doi="10.2196/26681", url="https://mental.jmir.org/2021/6/e26681", url="http://www.ncbi.nlm.nih.gov/pubmed/34106072" } @Article{info:doi/10.2196/29405, author="Izquierdo, Luis Jose and Soriano, B. 
Joan", title="Authors' Reply to: Minimizing Selection and Classification Biases Comment on ``Clinical Characteristics and Prognostic Factors for Intensive Care Unit Admission of Patients With COVID-19: Retrospective Study Using Machine Learning and Natural Language Processing''", journal="J Med Internet Res", year="2021", month="May", day="26", volume="23", number="5", pages="e29405", keywords="artificial intelligence", keywords="big data", keywords="COVID-19", keywords="electronic health records", keywords="tachypnea", keywords="SARS-CoV-2", keywords="predictive model", keywords="prognosis", keywords="classification bias", keywords="critical care", doi="10.2196/29405", url="https://www.jmir.org/2021/5/e29405", url="http://www.ncbi.nlm.nih.gov/pubmed/33989164" } @Article{info:doi/10.2196/27142, author="Martos P{\'e}rez, Francisco and Gomez Huelgas, Ricardo and Mart{\'i}n Escalante, Dolores Mar{\'i}a and Casas Rojo, Manuel Jos{\'e}", title="Minimizing Selection and Classification Biases. Comment on ``Clinical Characteristics and Prognostic Factors for Intensive Care Unit Admission of Patients With COVID-19: Retrospective Study Using Machine Learning and Natural Language Processing''", journal="J Med Internet Res", year="2021", month="May", day="26", volume="23", number="5", pages="e27142", keywords="artificial intelligence", keywords="big data", keywords="COVID-19", keywords="electronic health records", keywords="tachypnea", keywords="SARS-CoV-2", keywords="predictive model", keywords="prognosis", keywords="classification bias", keywords="critical care", doi="10.2196/27142", url="https://www.jmir.org/2021/5/e27142", url="http://www.ncbi.nlm.nih.gov/pubmed/33989163" } @Article{info:doi/10.2196/21668, author="Park, Rang Yu and Kim, Hyery and Park, Ae Ji and Ahn, Hyun Sang and Chang, Seyun and Shin, Won Jae and Kim, Myeongchan and Lee, Jae-Ho", title="Comparative Analysis of Single and Combined Antipyretics Using Patient-Generated Health Data: Retrospective Observational Study", journal="JMIR Mhealth Uhealth", year="2021", month="May", day="26", volume="9", number="5", pages="e21668", keywords="combination antipyretics", keywords="fever management", keywords="patient-generated health data", keywords="comparative analysis", keywords="mHealth", keywords="apps", keywords="fever", abstract="Background: Fever is one of the most common symptoms in children and is the physiological response of the human immune system to external pathogens. However, effectiveness studies of single and combined antipyretic therapy are relatively few due to lack of data. In this study, we used large-scale patient-generated health data from mobile apps to compare antipyretic affects between single and combination antipyretics. Objective: We aimed to establish combination patterns of antipyretics and compare antipyretic affects between single and combination antipyretics using large-scale patient-generated health data from mobile apps. Methods: This study was conducted using medical records of feverish children from July 2015 to June 2017 using the Fever Coach mobile app. In total, 3,584,748 temperature records and 1,076,002 antipyretic records of 104,337 children were analyzed. Antipyretic efficacy was measured by the mean difference in the area under the temperature change curve from baseline for 6 hours, 8 hours, 10 hours, and 12 hours after antipyretic administration in children with a body temperature of ?38.0 {\textcelcius} between single and combination groups. 
Results: The single antipyretic and combination groups comprised 152,017 and 54,842 cases, respectively. Acetaminophen was the most commonly used single agent (60,929/152,017, 40.08\%), and acetaminophen plus dexibuprofen was the most common combination (28,065/54,842, 51.17\%). We observed inappropriate use, including triple combination (1205/206,859, 0.58\%) and use under 38 {\textcelsius} (11,361/206,859, 5.50\%). Combination antipyretic use increased with temperature; 23.82\% (33,379/140,160) of cases were given a combination treatment when 38 {\textcelsius} $\leq$ temperature < 39 {\textcelsius}, while 41.40\% (1517/3664) were given a combination treatment when 40 {\textcelsius} $\leq$ temperature. The absolute value of the area under the curve at each hour was significantly higher in the single group than in the combination group; this trend was consistently observed, regardless of the type of antipyretics. In particular, the difference in delta fever between the two groups was greatest during the first 6 hours. The combination showed the lowest delta fever among all cases. Conclusions: Antipyretics combination patterns were analyzed using large-scale data. Approximately 75\% of febrile cases used single antipyretics, mostly acetaminophen, but combination usage became more frequent as temperature increased. However, combination antipyretics did not show definite advantages over single antipyretics in defervescence, regardless of the combination. Single antipyretics are effective in reducing fever and relieving discomfort in febrile children. ", doi="10.2196/21668", url="https://mhealth.jmir.org/2021/5/e21668", url="http://www.ncbi.nlm.nih.gov/pubmed/34037528" } @Article{info:doi/10.2196/25714, author="Vaghela, Uddhav and Rabinowicz, Simon and Bratsos, Paris and Martin, Guy and Fritzilas, Epameinondas and Markar, Sheraz and Purkayastha, Sanjay and Stringer, Karl and Singh, Harshdeep and Llewellyn, Charlie and Dutta, Debabrata and Clarke, M. Jonathan and Howard, Matthew and and Serban, Ovidiu and Kinross, James", title="Using a Secure, Continually Updating, Web Source Processing Pipeline to Support the Real-Time Data Synthesis and Analysis of Scientific Literature: Development and Validation Study", journal="J Med Internet Res", year="2021", month="May", day="6", volume="23", number="5", pages="e25714", keywords="structured data synthesis", keywords="data science", keywords="critical analysis", keywords="web crawl data", keywords="pipeline", keywords="database", keywords="literature", keywords="research", keywords="COVID-19", keywords="infodemic", keywords="decision making", keywords="data", keywords="data synthesis", keywords="misinformation", keywords="infrastructure", keywords="methodology", abstract="Background: The scale and quality of the global scientific response to the COVID-19 pandemic have unquestionably saved lives. However, the COVID-19 pandemic has also triggered an unprecedented ``infodemic''; the velocity and volume of data production have overwhelmed many key stakeholders such as clinicians and policy makers, as they have been unable to process structured and unstructured data for evidence-based decision making. Solutions that aim to alleviate this data synthesis--related challenge are unable to capture heterogeneous web data in real time for the production of concomitant answers and are not based on the high-quality information in responses to a free-text query. 
Objective: The main objective of this project is to build a generic, real-time, continuously updating curation platform that can support the data synthesis and analysis of a scientific literature framework. Our secondary objective is to validate this platform and the curation methodology for COVID-19--related medical literature by expanding the COVID-19 Open Research Dataset via the addition of new, unstructured data. Methods: To create an infrastructure that addresses our objectives, the PanSurg Collaborative at Imperial College London has developed a unique data pipeline based on a web crawler extraction methodology. This data pipeline uses a novel curation methodology that adopts a human-in-the-loop approach for the characterization of quality, relevance, and key evidence across a range of scientific literature sources. Results: REDASA (Realtime Data Synthesis and Analysis) is now one of the world's largest and most up-to-date sources of COVID-19--related evidence; it consists of 104,000 documents. By capturing curators' critical appraisal methodologies through the discrete labeling and rating of information, REDASA rapidly developed a foundational, pooled, data science data set of over 1400 articles in under 2 weeks. These articles provide COVID-19--related information and represent around 10\% of all papers about COVID-19. Conclusions: This data set can act as ground truth for the future implementation of a live, automated systematic review. The three benefits of REDASA's design are as follows: (1) it adopts a user-friendly, human-in-the-loop methodology by embedding an efficient, user-friendly curation platform into a natural language processing search engine; (2) it provides a curated data set in the JavaScript Object Notation format for experienced academic reviewers' critical appraisal choices and decision-making methodologies; and (3) due to the wide scope and depth of its web crawling method, REDASA has already captured one of the world's largest COVID-19--related data corpora for searches and curation. ", doi="10.2196/25714", url="https://www.jmir.org/2021/5/e25714", url="http://www.ncbi.nlm.nih.gov/pubmed/33835932" } @Article{info:doi/10.2196/27275, author="Borges do Nascimento, J{\'u}nior Israel and Marcolino, Soriano Milena and Abdulazeem, Mohamed Hebatullah and Weerasekara, Ishanka and Azzopardi-Muscat, Natasha and Gon{\c{c}}alves, Andr{\'e} Marcos and Novillo-Ortiz, David", title="Impact of Big Data Analytics on People's Health: Overview of Systematic Reviews and Recommendations for Future Studies", journal="J Med Internet Res", year="2021", month="Apr", day="13", volume="23", number="4", pages="e27275", keywords="public health", keywords="big data", keywords="health status", keywords="evidence-based medicine", keywords="big data analytics", keywords="secondary data analysis", keywords="machine learning", keywords="systematic review", keywords="overview", keywords="World Health Organization", abstract="Background: Although the potential of big data analytics for health care is well recognized, evidence is lacking on its effects on public health. Objective: The aim of this study was to assess the impact of the use of big data analytics on people's health based on the health indicators and core priorities in the World Health Organization (WHO) General Programme of Work 2019/2023 and the European Programme of Work (EPW), approved and adopted by its Member States, in addition to SARS-CoV-2--related studies. 
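The ingestion step of a crawler-based pipeline like the one described above can be sketched as a fetch-and-extract routine; this toy version (hypothetical function, placeholder URL list, not the PanSurg implementation) uses requests and BeautifulSoup:

```python
import requests
from bs4 import BeautifulSoup

def fetch_document_text(url: str) -> dict:
    """Fetch one web source and extract title and paragraph text
    for downstream human-in-the-loop curation."""
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    title = soup.title.get_text(strip=True) if soup.title else ""
    body = " ".join(p.get_text(" ", strip=True) for p in soup.find_all("p"))
    return {"url": url, "title": title, "text": body}

# seed_urls is a hypothetical list of literature source pages
# corpus = [fetch_document_text(u) for u in seed_urls]
```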
Furthermore, we sought to identify the most relevant challenges and opportunities of these tools with respect to people's health. Methods: Six databases (MEDLINE, Embase, Cochrane Database of Systematic Reviews via Cochrane Library, Web of Science, Scopus, and Epistemonikos) were searched from the inception date to September 21, 2020. Systematic reviews assessing the effects of big data analytics on health indicators were included. Two authors independently performed screening, selection, data extraction, and quality assessment using the AMSTAR-2 (A Measurement Tool to Assess Systematic Reviews 2) checklist. Results: The literature search initially yielded 185 records, 35 of which met the inclusion criteria, involving more than 5,000,000 patients. Most of the included studies used patient data collected from electronic health records, hospital information systems, private patient databases, and imaging datasets, and involved the use of big data analytics for noncommunicable diseases. ``Probability of dying from any of cardiovascular, cancer, diabetes or chronic renal disease'' and ``suicide mortality rate'' were the most commonly assessed health indicators and core priorities within the WHO General Programme of Work 2019/2023 and the EPW 2020/2025. Big data analytics have shown moderate to high accuracy for the diagnosis and prediction of complications of diabetes mellitus as well as for the diagnosis and classification of mental disorders; prediction of suicide attempts and behaviors; and the diagnosis, treatment, and prediction of important clinical outcomes of several chronic diseases. Confidence in the results was rated as ``critically low'' for 25 reviews, as ``low'' for 7 reviews, and as ``moderate'' for 3 reviews. The most frequently identified challenges were establishment of a well-designed and structured data source, and a secure, transparent, and standardized database for patient data. Conclusions: Although the overall quality of included studies was limited, big data analytics has shown moderate to high accuracy for the diagnosis of certain diseases, improvement in managing chronic diseases, and support for prompt and real-time analyses of large sets of varied input data to diagnose and predict disease outcomes. Trial Registration: International Prospective Register of Systematic Reviews (PROSPERO) CRD42020214048; https://www.crd.york.ac.uk/prospero/display\_record.php?RecordID=214048 ", doi="10.2196/27275", url="https://www.jmir.org/2021/4/e27275", url="http://www.ncbi.nlm.nih.gov/pubmed/33847586" } @Article{info:doi/10.2196/24656, author="Chatterjee, Ayan and Prinz, Andreas and Gerdes, Martin and Martinez, Santiago", title="An Automatic Ontology-Based Approach to Support Logical Representation of Observable and Measurable Data for Healthy Lifestyle Management: Proof-of-Concept Study", journal="J Med Internet Res", year="2021", month="Apr", day="9", volume="23", number="4", pages="e24656", keywords="activity", keywords="nutrition", keywords="sensor", keywords="questionnaire", keywords="SSN", keywords="ontology", keywords="SNOMED CT", keywords="eCoach", keywords="personalized", keywords="recommendation", keywords="automated", keywords="CDSS", keywords="healthy lifestyle", keywords="interoperability", keywords="eHealth", keywords="goal setting", keywords="semantics", keywords="simulation", keywords="proposition", abstract="Background: Lifestyle diseases, because of adverse health behavior, are the foremost cause of death worldwide. 
An eCoach system may encourage individuals to lead a healthy lifestyle with early health risk prediction, personalized recommendation generation, and goal evaluation. Such an eCoach system needs to collect and transform distributed heterogeneous health and wellness data into meaningful information to train an artificially intelligent health risk prediction model. However, it may produce a data compatibility dilemma. Our proposed eHealth ontology can increase interoperability between different heterogeneous networks, provide situation awareness, help in data integration, and discover inferred knowledge. This ``proof-of-concept'' study will help sensor, questionnaire, and interview data to be more organized for health risk prediction and personalized recommendation generation targeting obesity as a study case. Objective: The aim of this study is to develop an OWL-based ontology (UiA eHealth Ontology/UiAeHo) model to annotate personal, physiological, behavioral, and contextual data from heterogeneous sources (sensor, questionnaire, and interview), followed by structuring and standardizing of diverse descriptions to generate meaningful, practical, personalized, and contextual lifestyle recommendations based on the defined rules. Methods: We have developed a simulator to collect dummy personal, physiological, behavioral, and contextual data related to artificial participants involved in health monitoring. We have integrated the concepts of ``Semantic Sensor Network Ontology'' and ``Systematized Nomenclature of Medicine---Clinical Terms'' to develop our proposed eHealth ontology. The ontology has been created using Prot{\'e}g{\'e} (version 5.x). We have used the Java-based ``Jena Framework'' (version 3.16) for building a semantic web application that includes resource description framework (RDF) application programming interface (API), OWL API, native tuple store (tuple database), and the SPARQL (Simple Protocol and RDF Query Language) query engine. The logical and structural consistency of the proposed ontology has been evaluated with the ``HermiT 1.4.3.x'' ontology reasoner available in Prot{\'e}g{\'e} 5.x. Results: The proposed ontology has been implemented for the study case ``obesity.'' However, it can be extended further to other lifestyle diseases. ``UiA eHealth Ontology'' has been constructed using logical axioms, declaration axioms, classes, object properties, and data properties. The ontology can be visualized with ``Owl Viz,'' and the formal representation has been used to infer a participant's health status using the ``HermiT'' reasoner. We have also developed a module for ontology verification that behaves like a rule-based decision support system to predict the probability for health risk, based on the evaluation of the results obtained from SPARQL queries. Furthermore, we discussed the potential lifestyle recommendation generation plan against adverse behavioral risks. Conclusions: This study has led to the creation of a meaningful, context-specific ontology to model massive, unintuitive, raw, unstructured observations for health and wellness data (eg, sensors, interviews, questionnaires) and to annotate them with semantic metadata to create a compact, intelligible abstraction for health risk predictions for individualized recommendation generation. 
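The rule-like SPARQL verification described above can be illustrated in Python with rdflib (the authors used the Java Jena stack); the namespace, class, property, and BMI rule below are invented for illustration only:

```python
from rdflib import Graph, Literal, Namespace, RDF

EX = Namespace("http://example.org/uia-eho#")  # hypothetical namespace

g = Graph()
g.add((EX.p001, RDF.type, EX.Participant))
g.add((EX.p001, EX.hasBMI, Literal(31.4)))

# Rule-like query: flag participants whose BMI meets the obesity threshold
query = """
PREFIX ex: <http://example.org/uia-eho#>
SELECT ?p ?bmi WHERE {
    ?p a ex:Participant ;
       ex:hasBMI ?bmi .
    FILTER (?bmi >= 30.0)
}
"""
for row in g.query(query):
    print(f"{row.p} flagged for obesity risk (BMI={row.bmi})")
```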
", doi="10.2196/24656", url="https://www.jmir.org/2021/4/e24656", url="http://www.ncbi.nlm.nih.gov/pubmed/33835031" } @Article{info:doi/10.2196/22645, author="Kammrath Betancor, Paola and Tizek, Linda and Zink, Alexander and Reinhard, Thomas and B{\"o}hringer, Daniel", title="Estimating the Incidence of Conjunctivitis by Comparing the Frequency of Google Search Terms With Clinical Data: Retrospective Study", journal="JMIR Public Health Surveill", year="2021", month="Mar", day="3", volume="7", number="3", pages="e22645", keywords="epidemic keratoconjunctivitis", keywords="big data", keywords="Google search", keywords="Freiburg clinical data", abstract="Background: Infectious conjunctivitis is contagious and may lead to an outbreak. Prevention systems can help to avoid an outbreak. Objective: We aimed to evaluate if Google search data on conjunctivitis and associated terms can be used to estimate the incidence and if the data can provide an estimation for outbreaks. Methods: We obtained Google search data over 4 years for the German term for conjunctivitis (``Bindehautentz{\"u}ndung'') and 714 associated terms in 12 selected German cities and Germany as a whole using the Google AdWords Keyword Planner. The search volume from Freiburg was correlated with clinical data from the Freiburg emergency practice (Eye Center University of Freiburg). Results: The search volume for the German term for conjunctivitis in Germany as a whole and in the 12 German cities showed a highly uniform seasonal pattern. Cross-correlation between the temporal search frequencies in Germany as a whole and the 12 selected cities was high without any lag. Cross-correlation of the search volume in Freiburg with the frequency of conjunctivitis (International Statistical Classification of Diseases and Related Health Problems [ICD] code group ``H10.-'') from the centralized ophthalmologic emergency practice in Freiburg revealed a considerable temporal association, with the emergency practice lagging behind the frequency. Additionally, Pearson correlation between the count of patients per month and the count of searches per month in Freiburg was statistically significant (P=.04). Conclusions: We observed a close correlation between the Google search volume for the signs and symptoms of conjunctivitis and the frequency of patients with a congruent diagnosis in the Freiburg region. Regional deviations from the nationwide average search volume may therefore indicate a regional outbreak of infectious conjunctivitis. ", doi="10.2196/22645", url="https://publichealth.jmir.org/2021/3/e22645", url="http://www.ncbi.nlm.nih.gov/pubmed/33656450" } @Article{info:doi/10.2196/21679, author="Parikh, Soham and Davoudi, Anahita and Yu, Shun and Giraldo, Carolina and Schriver, Emily and Mowery, Danielle", title="Lexicon Development for COVID-19-related Concepts Using Open-source Word Embedding Sources: An Intrinsic and Extrinsic Evaluation", journal="JMIR Med Inform", year="2021", month="Feb", day="22", volume="9", number="2", pages="e21679", keywords="natural language processing", keywords="word embedding", keywords="COVID-19", keywords="intrinsic", keywords="open-source", keywords="computation", keywords="model", keywords="prediction", keywords="semantic", keywords="syntactic", keywords="pattern", abstract="Background: Scientists are developing new computational methods and prediction models to better clinically understand COVID-19 prevalence, treatment efficacy, and patient outcomes. 
These efforts could be improved by leveraging documented COVID-19--related symptoms, findings, and disorders from clinical text sources in an electronic health record. Word embeddings can identify terms related to these clinical concepts from both the biomedical and nonbiomedical domains, and are being shared with the open-source community at large. However, it's unclear how useful openly available word embeddings are for developing lexicons for COVID-19--related concepts. Objective: Given an initial lexicon of COVID-19--related terms, this study aims to characterize the returned terms by similarity across various open-source word embeddings and determine common semantic and syntactic patterns between the COVID-19 queried terms and returned terms specific to the word embedding source. Methods: We compared seven openly available word embedding sources. Using a series of COVID-19--related terms for associated symptoms, findings, and disorders, we conducted an interannotator agreement study to determine how accurately the most similar returned terms could be classified according to semantic types by three annotators. We conducted a qualitative study of COVID-19 queried terms and their returned terms to detect informative patterns for constructing lexicons. We demonstrated the utility of applying such learned synonyms to discharge summaries by reporting the proportion of patients identified by concept among three patient cohorts: pneumonia (n=6410), acute respiratory distress syndrome (n=8647), and COVID-19 (n=2397). Results: We observed high pairwise interannotator agreement (Cohen kappa) for symptoms (0.86-0.99), findings (0.93-0.99), and disorders (0.93-0.99). Word embedding sources generated based on characters tend to return more synonyms (mean count of 7.2 synonyms) compared to token-based embedding sources (mean counts range from 2.0 to 3.4). Word embedding sources queried using a qualifier term (eg, dry cough or muscle pain) more often returned qualifiers of the similar semantic type (eg, ``dry'' returns consistency qualifiers like ``wet'' and ``runny'') compared to a single term (eg, cough or pain) queries. A higher proportion of patients had documented fever (0.61-0.84), cough (0.41-0.55), shortness of breath (0.40-0.59), and hypoxia (0.51-0.56) retrieved than other clinical features. Terms for dry cough returned a higher proportion of patients with COVID-19 (0.07) than the pneumonia (0.05) and acute respiratory distress syndrome (0.03) populations. Conclusions: Word embeddings are valuable technology for learning related terms, including synonyms. When leveraging openly available word embedding sources, choices made for the construction of the word embeddings can significantly influence the words learned. ", doi="10.2196/21679", url="https://medinform.jmir.org/2021/2/e21679", url="http://www.ncbi.nlm.nih.gov/pubmed/33544689" } @Article{info:doi/10.2196/18840, author="Walkey, J. Allan and Bashar, K. Syed and Hossain, Billal Md and Ding, Eric and Albuquerque, Daniella and Winter, Michael and Chon, H. Ki and McManus, D. 
David", title="Development and Validation of an Automated Algorithm to Detect Atrial Fibrillation Within Stored Intensive Care Unit Continuous Electrocardiographic Data: Observational Study", journal="JMIR Cardio", year="2021", month="Feb", day="15", volume="5", number="1", pages="e18840", keywords="atrial fibrillation", keywords="sepsis", keywords="intensive care unit", keywords="big data", keywords="data science", abstract="Background: Atrial fibrillation (AF) is the most common arrhythmia during critical illness, representing a sepsis-defining cardiac dysfunction associated with adverse outcomes. Large burdens of premature beats and noisy signal during sepsis may pose unique challenges to automated AF detection. Objective: The objective of this study is to develop and validate an automated algorithm to accurately identify AF within electronic health care data among critically ill patients with sepsis. Methods: This is a retrospective cohort study of patients hospitalized with sepsis identified from Medical Information Mart for Intensive Care (MIMIC III) electronic health data with linked electrocardiographic (ECG) telemetry waveforms. Within 3 separate cohorts of 50 patients, we iteratively developed and validated an automated algorithm that identifies ECG signals, removes noise, and identifies irregular rhythm and premature beats in order to identify AF. We compared the automated algorithm to current methods of AF identification in large databases, including ICD-9 (International Classification of Diseases, 9th edition) codes and hourly nurse annotation of heart rhythm. Methods of AF identification were tested against gold-standard manual ECG review. Results: AF detection algorithms that did not differentiate AF from premature atrial and ventricular beats performed modestly, with 76\% (95\% CI 61\%-87\%) accuracy. Performance improved (P=.02) with the addition of premature beat detection (validation set accuracy: 94\% [95\% CI 83\%-99\%]). Median time between automated and manual detection of AF onset was 30 minutes (25th-75th percentile 0-208 minutes). The accuracy of ICD-9 codes (68\%; P=.002 vs automated algorithm) and nurse charting (80\%; P=.02 vs algorithm) was lower than that of the automated algorithm. Conclusions: An automated algorithm using telemetry ECG data can feasibly and accurately detect AF among critically ill patients with sepsis, and represents an improvement in AF detection within large databases. ", doi="10.2196/18840", url="http://cardio.jmir.org/2021/1/e18840/", url="http://www.ncbi.nlm.nih.gov/pubmed/33587041" } @Article{info:doi/10.2196/20225, author="Rivera-Rodriguez, Claudia and Cheung, Gary and Cullum, Sarah", title="Using Big Data to Estimate Dementia Prevalence in New Zealand: Protocol for an Observational Study", journal="JMIR Res Protoc", year="2021", month="Jan", day="6", volume="10", number="1", pages="e20225", keywords="routinely collected data", keywords="repeated measures", keywords="dementia", keywords="Alzheimer disease", keywords="modeling", keywords="complex sampling \emspace", abstract="Background: Dementia describes a cluster of symptoms that includes memory loss; difficulties with thinking, problem solving, or language; and functional impairment. Dementia can be caused by a number of neurodegenerative diseases, such as Alzheimer disease and cerebrovascular disease. 
Currently in New Zealand, most of the systematically collected and detailed information on dementia is obtained through a suite of International Residential Assessment Instrument (interRAI) assessments, including the home care, contact assessment, and long-term care facility versions. These versions of interRAI are standardized comprehensive geriatric assessments. Patients are referred to have an interRAI assessment by the Needs Assessment and Service Coordination (NASC) services after a series of screening processes. Previous estimates of the prevalence and costs of dementia in New Zealand have been based on international studies with different populations and health and social care systems. This new local knowledge will have implications for estimating the demographic distribution and socioeconomic impact of dementia in New Zealand. Objective: This study investigates the prevalence of dementia, risk factors for dementia, and drivers of the informal cost of dementia among people registered in the NASC database in New Zealand. Methods: This study aims to analyze secondary data routinely collected by the NASC and interRAI (home care and contact assessment versions) databases between July 1, 2014, and July 1, 2019, in New Zealand. The databases will be linked to produce an integrated data set, which will be used to (1) investigate the sociodemographic and clinical risk factors associated with dementia and other neurological conditions, (2) estimate the prevalence of dementia using weighting methods for complex samples, and (3) identify the cost of informal care per client (in number of hours of care provided by unpaid carers) and the drivers of such costs. We will use design-based survey methods for the estimation of prevalence and generalized estimating equations for regression models and correlated and longitudinal data. Results: The results will provide much needed statistics regarding dementia prevalence and risk factors and the cost of informal care for people living with dementia in New Zealand. Potential health inequities for different ethnic groups will be highlighted, which can then be used by decision makers to inform the development of policy and practice. Conclusions: As of November 2020, there were no dementia prevalence studies or studies on informal care costs of dementia using national data from New Zealand. All existing studies have used data from other populations with substantially different demographic distributions. This study will give insight into the actual prevalence, risk factors, and informal care costs of dementia for the population with support needs in New Zealand. It will provide valuable information to improve health outcomes and better inform policy and planning. International Registered Report Identifier (IRRID): DERR1-10.2196/20225 ", doi="10.2196/20225", url="https://www.researchprotocols.org/2021/1/e20225", url="http://www.ncbi.nlm.nih.gov/pubmed/33404510" } @Article{info:doi/10.2196/23518, author="Jimenez, Jimenez Alberto and Estevez-Reboredo, M. Rosa and Santed, A. 
Miguel and Ramos, Victoria", title="COVID-19 Symptom-Related Google Searches and Local COVID-19 Incidence in Spain: Correlational Study", journal="J Med Internet Res", year="2020", month="Dec", day="18", volume="22", number="12", pages="e23518", keywords="behavioral epidemiology", keywords="big data", keywords="smart data", keywords="tracking", keywords="nowcasting", keywords="forecast", keywords="predict", keywords="infosurveillance", keywords="infodemiology", keywords="COVID-19", abstract="Background: COVID-19 is one of the biggest pandemics in human history, along with other disease pandemics, such as the H1N1 influenza A, bubonic plague, and smallpox pandemics. This study is a small contribution that tries to find contrasted formulas to alleviate global suffering and guarantee a more manageable future. Objective: In this study, a statistical approach was proposed to study the correlation between the incidence of COVID-19 in Spain and search data provided by Google Trends. Methods: We assessed the linear correlation between Google Trends search data and the data provided by the National Center of Epidemiology in Spain---which is dependent on the Instituto de Salud Carlos III---regarding the number of COVID-19 cases reported with a certain time lag. These data enabled the identification of anticipatory patterns. Results: In response to the ongoing outbreak, our results demonstrate that by using our correlation test, the evolution of the COVID-19 pandemic can be predicted in Spain up to 11 days in advance. Conclusions: During the epidemic, Google Trends offers the possibility to preempt health care decisions in real time by tracking people's concerns through their search patterns. This can be of great help given the critical, if not dramatic need for complementary monitoring approaches that work on a population level and inform public health decisions in real time. This study of Google search patterns, which was motivated by the fears of individuals in the face of a pandemic, can be useful in anticipating the development of the pandemic. ", doi="10.2196/23518", url="http://www.jmir.org/2020/12/e23518/", url="http://www.ncbi.nlm.nih.gov/pubmed/33156803" } @Article{info:doi/10.2196/24432, author="Li, Zhenlong and Li, Xiaoming and Porter, Dwayne and Zhang, Jiajia and Jiang, Yuqin and Olatosi, Bankole and Weissman, Sharon", title="Monitoring the Spatial Spread of COVID-19 and Effectiveness of Control Measures Through Human Movement Data: Proposal for a Predictive Model Using Big Data Analytics", journal="JMIR Res Protoc", year="2020", month="Dec", day="18", volume="9", number="12", pages="e24432", keywords="big data", keywords="human movement", keywords="spatial computing", keywords="COVID-19", keywords="artificial intelligence", abstract="Background: Human movement is one of the forces that drive the spatial spread of infectious diseases. To date, reducing and tracking human movement during the COVID-19 pandemic has proven effective in limiting the spread of the virus. Existing methods for monitoring and modeling the spatial spread of infectious diseases rely on various data sources as proxies of human movement, such as airline travel data, mobile phone data, and banknote tracking. However, intrinsic limitations of these data sources prevent us from systematic monitoring and analyses of human movement on different spatial scales (from local to global). 
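A search-leads-cases analysis like the Spanish Google Trends study above reduces to scanning candidate lags for the strongest correlation; a minimal pandas sketch with hypothetical daily series (the study reported usable lead times of up to 11 days):

```python
import pandas as pd

def best_lead_time(searches: pd.Series, cases: pd.Series, max_lag_days: int = 14):
    """Shift daily search volume forward by 0..max_lag_days and return the
    lag whose Pearson correlation with reported cases is strongest.

    A lag of k means searches made k days earlier best track the case curve."""
    corrs = {lag: searches.shift(lag).corr(cases)
             for lag in range(max_lag_days + 1)}
    best = max(corrs, key=lambda k: corrs[k])
    return best, corrs[best]

# searches, cases: hypothetical date-indexed daily pd.Series, aligned on index
```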
Objective: Big data from social media such as geotagged tweets have been widely used in human mobility studies, yet more research is needed to validate the capabilities and limitations of using such data for studying human movement at different geographic scales (eg, from local to global) in the context of global infectious disease transmission. This study aims to develop a novel data-driven public health approach using big data from Twitter coupled with other human mobility data sources and artificial intelligence to monitor and analyze human movement at different spatial scales (from global to regional to local). Methods: We will first develop a database with optimized spatiotemporal indexing to store and manage the multisource data sets collected in this project. This database will be connected to our in-house Hadoop computing cluster for efficient big data computing and analytics. We will then develop innovative data models, predictive models, and computing algorithms to effectively extract and analyze human movement patterns using geotagged big data from Twitter and other human mobility data sources, with the goal of enhancing situational awareness and risk prediction in public health emergency response and disease surveillance systems. Results: This project was funded as of May 2020. We have started the data collection, processing, and analysis for the project. Conclusions: Research findings can help government officials, public health managers, emergency responders, and researchers answer critical questions during the pandemic regarding the current and future infectious risk of a state, county, or community and the effectiveness of social/physical distancing practices in curtailing the spread of the virus. International Registered Report Identifier (IRRID): DERR1-10.2196/24432 ", doi="10.2196/24432", url="http://www.researchprotocols.org/2020/12/e24432/", url="http://www.ncbi.nlm.nih.gov/pubmed/33301418" } @Article{info:doi/10.2196/20783, author="Schmit, Cason and Ajayi, V. Kobi and Ferdinand, O. Alva and Giannouchos, Theodoros and Ilangovan, Gurudev and Nowell, Benjamin W. and Kum, Hye-Chung", title="Communicating With Patients About Software for Enhancing Privacy in Secondary Database Research Involving Record Linkage: Delphi Study", journal="J Med Internet Res", year="2020", month="Dec", day="15", volume="22", number="12", pages="e20783", keywords="Delphi technique", keywords="privacy", keywords="communication barriers", keywords="medical record linkage", keywords="research subjects", keywords="big data", abstract="Background: There is substantial prior research on the perspectives of patients on the use of health information for research. Numerous communication barriers challenge transparency between researchers and data participants in secondary database research (eg, waiver of informed consent and knowledge gaps). Individual concerns and misconceptions challenge the trust in researchers among patients despite efforts to protect data. Technical software used to protect research data can further complicate the public's understanding of research. For example, MiNDFIRL (Minimum Necessary Disclosure For Interactive Record Linkage) is a prototype software that can be used to enhance the confidentiality of data sets by restricting disclosures of identifying information during the record linkage process. However, software, such as MiNDFIRL, which is used to protect data, must overcome the aforementioned communication barriers. 
One proposed solution is the creation of an interactive web-based frequently asked question (FAQ) template that can be adapted and used to communicate research issues to data subjects. Objective: This study aims to improve communication with patients and transparency about how complex software, such as MiNDFIRL, is used to enhance privacy in secondary database studies to maintain the public's trust in researchers. Methods: A Delphi technique with 3 rounds of the survey was used to develop the FAQ document to communicate privacy issues related to a generic secondary database study using the MiNDFIRL software. The Delphi panel consisted of 38 patients with chronic health conditions. We revised the FAQ between Delphi rounds and provided participants with a summary of the feedback. We adopted a conservative consensus threshold of less than 10\% negative feedback per FAQ section. Results: We developed a consensus language for 21 of the 24 FAQ sections. Participant feedback demonstrated preference differences (eg, brevity vs comprehensiveness). We adapted the final FAQ into an interactive web-based format that 94\% (31/33) of the participants found helpful or very helpful. The template FAQ and MiNDFIRL source code are available on GitHub. The results indicate the following patient communication considerations: patients have diverse and varied preferences; the tone is important but challenging; and patients want information on security, identifiers, and final disposition of information. Conclusions: The findings of this study provide insights into what research-related information is useful to patients and how researchers can communicate such information. These findings align with the current understanding of health literacy and its challenges. Communication is essential to transparency and ethical data use, yet it is exceedingly challenging. Developing FAQ template language to accompany a complex software may enable researchers to provide greater transparency when informed consent is not possible. ", doi="10.2196/20783", url="http://www.jmir.org/2020/12/e20783/", url="http://www.ncbi.nlm.nih.gov/pubmed/33320097" } @Article{info:doi/10.2196/17892, author="Chishtie, Ahmed Jawad and Marchand, Jean-Sebastien and Turcotte, A. Luke and Bielska, Anna Iwona and Babineau, Jessica and Cepoiu-Martin, Monica and Irvine, Michael and Munce, Sarah and Abudiab, Sally and Bjelica, Marko and Hossain, Saima and Imran, Muhammad and Jeji, Tara and Jaglal, Susan", title="Visual Analytic Tools and Techniques in Population Health and Health Services Research: Scoping Review", journal="J Med Internet Res", year="2020", month="Dec", day="3", volume="22", number="12", pages="e17892", keywords="visual analytics", keywords="machine learning", keywords="data visualization", keywords="data mining", keywords="population health", keywords="health services research", keywords="mobile phone", abstract="Background: Visual analytics (VA) promotes the understanding of data with visual, interactive techniques, using analytic and visual engines. The analytic engine includes automated techniques, whereas common visual outputs include flow maps and spatiotemporal hot spots. Objective: This scoping review aims to address a gap in the literature, with the specific objective to synthesize literature on the use of VA tools, techniques, and frameworks in interrelated health care areas of population health and health services research (HSR). 
Methods: Using the 2018 PRISMA-ScR (Preferred Reporting Items for Systematic Reviews and Meta-Analyses Extension for Scoping Reviews) guidelines, the review focuses on peer-reviewed journal articles and full conference papers from 2005 to March 2019. Two researchers were involved at each step, and another researcher arbitrated disagreements. A comprehensive abstraction platform captured data from diverse bodies of the literature, primarily from the computer and health sciences. Results: After screening 11,310 articles, findings from 55 articles were synthesized under the major headings of visual and analytic engines, visual presentation characteristics, tools used and their capabilities, application to health care areas, data types and sources, VA frameworks, frameworks used for VA applications, availability and innovation, and co-design initiatives. We found extensive application of VA methods used in areas of epidemiology, surveillance and modeling, health services access, use, and cost analyses. All articles included a distinct analytic and visualization engine, with varying levels of detail provided. Most tools were prototypes, with 5 in use at the time of publication. Seven articles presented methodological frameworks. Toward consistent reporting, we present a checklist, with an expanded definition for VA applications in health care, to assist researchers in sharing research for greater replicability. We summarized the results in a Tableau dashboard. Conclusions: With the increasing availability and generation of big health care data, VA is a fast-growing method applied to complex health care data. What makes VA innovative is its capability to process multiple, varied data sources to demonstrate trends and patterns for exploratory analysis, leading to knowledge generation and decision support. This is the first review to bridge a critical gap in the literature on VA methods applied to the areas of population health and HSR, which further indicates possible avenues for the adoption of these methods in the future. This review is especially important in the wake of COVID-19 surveillance and response initiatives, where many VA products have taken center stage. International Registered Report Identifier (IRRID): RR2-10.2196/14019 ", doi="10.2196/17892", url="https://www.jmir.org/2020/12/e17892", url="http://www.ncbi.nlm.nih.gov/pubmed/33270029" } @Article{info:doi/10.2196/19679, author="Jeong, Seung-Hyun and Lee, Rim Tae and Kang, Bae Jung and Choi, Mun-Taek", title="Analysis of Health Insurance Big Data for Early Detection of Disabilities: Algorithm Development and Validation", journal="JMIR Med Inform", year="2020", month="Nov", day="23", volume="8", number="11", pages="e19679", keywords="early detection of disabilities", keywords="health insurance", keywords="big data", keywords="feature selection", keywords="classification", abstract="Background: Early detection of childhood developmental delays is very important for the treatment of disabilities. Objective: To investigate the possibility of detecting childhood developmental delays leading to disabilities before clinical registration by analyzing big data from a health insurance database. Methods: In this study, the data from children, individuals aged up to 13 years (n=2412), from the Sample Cohort 2.0 DB of the Korea National Health Insurance Service were organized by age range. 
Using 6 categories (having no disability, having a physical disability, having a brain lesion, having a visual impairment, having a hearing impairment, and having other conditions), features were selected in the order of importance with a tree-based model. We used multiple classification algorithms to find the best model for each age range. The earliest age range with clinically significant performance showed the age at which conditions can be detected early. Results: The disability detection model showed that it was possible to detect disabilities with significant accuracy even at the age of 4 years, about a year earlier than the mean diagnostic age of 4.99 years. Conclusions: Using big data analysis, we discovered the possibility of detecting disabilities earlier than clinical diagnoses, which would allow us to take appropriate action to prevent disabilities. ", doi="10.2196/19679", url="http://medinform.jmir.org/2020/11/e19679/", url="http://www.ncbi.nlm.nih.gov/pubmed/33226352" } @Article{info:doi/10.2196/21931, author="Lin, Min-Qiang and Lian, Chen-Lu and Zhou, Ping and Lei, Jian and Wang, Jun and Hua, Li and Zhou, Juan and Wu, San-Gang", title="Analysis of the Trends in Publications on Clinical Cancer Research in Mainland China from the Surveillance, Epidemiology, and End Results (SEER) Database: Bibliometric Study", journal="JMIR Med Inform", year="2020", month="Nov", day="17", volume="8", number="11", pages="e21931", keywords="cancer", keywords="China", keywords="data collection", keywords="bibliometrics", keywords="PubMed", keywords="SEER program", abstract="Background: The application of China's big data sector in cancer research is just the beginning. In recent decades, more and more Chinese scholars have used the Surveillance, Epidemiology, and End Results (SEER) database for clinical cancer research. A comprehensive bibliometric study is required to analyze the tendency of Chinese scholars to utilize the SEER database for clinical cancer research and provide a reference for the future of big data analytics. Objective: Our study aimed to assess the trend of publications on clinical cancer research in mainland China from the SEER database. Methods: We performed a PubMed search to identify papers published with data from the SEER database in mainland China until August 31, 2020. Results: A total of 1566 papers utilizing the SEER database that were authored by investigators in mainland China were identified. Over the past years, significant growth in studies based on the SEER database was observed (P<.001). The top 5 research topics were breast cancer (213/1566, 13.6\%), followed by colorectal cancer (185/1566, 11.8\%), lung cancer (179/1566, 11.4\%), gastrointestinal cancer (excluding colorectal cancer; 149/1566, 9.5\%), and genital system cancer (93/1566, 5.9\%). Approximately 75.2\% (1178/1566) of papers were published from the eastern coastal region of China, and Fudan University Shanghai Cancer Center (Shanghai, China) was the most active organization. Overall, 267 journals were analyzed in this study, of which Oncotarget was the most contributing journal (136/267, 50.9\%). Of the 1566 papers studied, 585 (37.4\%) were published in the second quartile, 489 (31.2\%) in the third quartile, 312 (19.9\%) in the first quartile, and 80 (5.1\%) in the fourth quartile, with 100 (6.4\%) having an unknown Journal Citation Reports ranking. Conclusions: Clinical cancer research based on the SEER database in mainland China underwent constant and rapid growth during recent years. 
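The tree-based feature ranking plus multi-classifier comparison described in the disability detection entry above is a standard scikit-learn pattern; a condensed sketch, assuming hypothetical arrays X (claims-derived features) and y (disability labels) for one age range:

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def rank_and_compare(X, y, top_k=20):
    """Rank features by random forest importance, then compare classifiers
    on the top-k features via cross-validation, one age range at a time."""
    forest = RandomForestClassifier(n_estimators=200, random_state=0).fit(X, y)
    top = np.argsort(forest.feature_importances_)[::-1][:top_k]
    scores = {}
    for name, model in {
        "random_forest": RandomForestClassifier(n_estimators=200, random_state=0),
        "logistic_regression": LogisticRegression(max_iter=1000),
    }.items():
        scores[name] = cross_val_score(model, X[:, top], y, cv=5).mean()
    return top, scores

# X: hypothetical (n_children, n_features) numpy array; y: binary labels
```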
High-quality and comprehensive cancer databases based on Chinese demographic data are urgently needed. ", doi="10.2196/21931", url="http://medinform.jmir.org/2020/11/e21931/", url="http://www.ncbi.nlm.nih.gov/pubmed/33200992" } @Article{info:doi/10.2196/21801, author="Izquierdo, Luis Jose and Ancochea, Julio and and Soriano, B. Joan", title="Clinical Characteristics and Prognostic Factors for Intensive Care Unit Admission of Patients With COVID-19: Retrospective Study Using Machine Learning and Natural Language Processing", journal="J Med Internet Res", year="2020", month="Oct", day="28", volume="22", number="10", pages="e21801", keywords="artificial intelligence", keywords="big data", keywords="COVID-19", keywords="electronic health records", keywords="tachypnea", keywords="SARS-CoV-2", keywords="predictive model", abstract="Background: Many factors involved in the onset and clinical course of the ongoing COVID-19 pandemic are still unknown. Although big data analytics and artificial intelligence are widely used in the realms of health and medicine, researchers are only beginning to use these tools to explore the clinical characteristics and predictive factors of patients with COVID-19. Objective: Our primary objectives are to describe the clinical characteristics and determine the factors that predict intensive care unit (ICU) admission of patients with COVID-19. Determining these factors using a well-defined population can increase our understanding of the real-world epidemiology of the disease. Methods: We used a combination of classic epidemiological methods, natural language processing (NLP), and machine learning (for predictive modeling) to analyze the electronic health records (EHRs) of patients with COVID-19. We explored the unstructured free text in the EHRs within the Servicio de Salud de Castilla-La Mancha (SESCAM) Health Care Network (Castilla-La Mancha, Spain) from the entire population with available EHRs (1,364,924 patients) from January 1 to March 29, 2020. We extracted related clinical information regarding diagnosis, progression, and outcome for all COVID-19 cases. Results: A total of 10,504 patients with a clinical or polymerase chain reaction--confirmed diagnosis of COVID-19 were identified; 5519 (52.5\%) were male, with a mean age of 58.2 years (SD 19.7). Upon admission, the most common symptoms were cough, fever, and dyspnea; however, all three symptoms occurred in fewer than half of the cases. Overall, 6.1\% (83/1353) of hospitalized patients required ICU admission. Using a machine-learning, data-driven algorithm, we identified that a combination of age, fever, and tachypnea was the most parsimonious predictor of ICU admission; patients younger than 56 years, without tachypnea, and temperature <39 {\textdegree}C (or >39 {\textdegree}C without respiratory crackles) were not admitted to the ICU. In contrast, patients with COVID-19 aged 40 to 79 years were likely to be admitted to the ICU if they had tachypnea and delayed their visit to the emergency department after being seen in primary care. Conclusions: Our results show that a combination of easily obtainable clinical variables (age, fever, and tachypnea with or without respiratory crackles) predicts whether patients with COVID-19 will require ICU admission. ", doi="10.2196/21801", url="http://www.jmir.org/2020/10/e21801/", url="http://www.ncbi.nlm.nih.gov/pubmed/33090964" } @Article{info:doi/10.2196/13567, author="Bhavnani, K. 
Suresh and Dang, Bryant and Penton, Rebekah and Visweswaran, Shyam and Bassler, E. Kevin and Chen, Tianlong and Raji, Mukaila and Divekar, Rohit and Zuhour, Raed and Karmarkar, Amol and Kuo, Yong-Fang and Ottenbacher, J. Kenneth", title="How High-Risk Comorbidities Co-Occur in Readmitted Patients With Hip Fracture: Big Data Visual Analytical Approach", journal="JMIR Med Inform", year="2020", month="Oct", day="26", volume="8", number="10", pages="e13567", keywords="unplanned hospital readmission", keywords="visual analytics", keywords="bipartite networks", keywords="precision medicine", abstract="Background: When older adult patients with hip fracture (HFx) have unplanned hospital readmissions within 30 days of discharge, it doubles their 1-year mortality, resulting in substantial personal and financial burdens. Although such unplanned readmissions are predominantly caused by reasons not related to HFx surgery, few studies have focused on how pre-existing high-risk comorbidities co-occur within and across subgroups of patients with HFx. Objective: This study aims to use a combination of supervised and unsupervised visual analytical methods to (1) obtain an integrated understanding of comorbidity risk, comorbidity co-occurrence, and patient subgroups, and (2) enable a team of clinical and methodological stakeholders to infer the processes that precipitate unplanned hospital readmission, with the goal of designing targeted interventions. Methods: We extracted a training data set consisting of 16,886 patients (8443 readmitted patients with HFx and 8443 matched controls) and a replication data set consisting of 16,222 patients (8111 readmitted patients with HFx and 8111 matched controls) from the 2010 and 2009 Medicare database, respectively. The analyses consisted of a supervised combinatorial analysis to identify and replicate combinations of comorbidities that conferred significant risk for readmission, an unsupervised bipartite network analysis to identify and replicate how high-risk comorbidity combinations co-occur across readmitted patients with HFx, and an integrated visualization and analysis of comorbidity risk, comorbidity co-occurrence, and patient subgroups to enable clinician stakeholders to infer the processes that precipitate readmission in patient subgroups and to propose targeted interventions. Results: The analyses helped to identify (1) 11 comorbidity combinations that conferred significantly higher risk (ranging from P<.001 to P=.01) for a 30-day readmission, (2) 7 biclusters of patients and comorbidities with a significant bicluster modularity (P<.001; Medicare=0.440; random mean 0.383 [0.002]), indicating strong heterogeneity in the comorbidity profiles of readmitted patients, and (3) inter- and intracluster risk associations, which enabled clinician stakeholders to infer the processes involved in the exacerbation of specific combinations of comorbidities leading to readmission in patient subgroups. Conclusions: The integrated analysis of risk, co-occurrence, and patient subgroups enabled the inference of processes that precipitate readmission, leading to a comorbidity exacerbation risk model for readmission after HFx. These results have direct implications for (1) the management of comorbidities targeted at high-risk subgroups of patients with the goal of pre-emptively reducing their risk of readmission and (2) the development of more accurate risk prediction models that incorporate information about patient subgroups. 
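The patient-comorbidity bipartite analysis above can be approximated with networkx; the edge list is invented, and greedy modularity communities stand in for the paper's biclustering method:

```python
import networkx as nx
from networkx.algorithms import community

# Hypothetical patient-comorbidity edges (the paper uses Medicare claims)
edges = [("p1", "chf"), ("p1", "ckd"), ("p2", "chf"),
         ("p3", "copd"), ("p3", "anemia"), ("p4", "copd")]

G = nx.Graph()
G.add_nodes_from({p for p, _ in edges}, bipartite=0)  # patient partition
G.add_nodes_from({c for _, c in edges}, bipartite=1)  # comorbidity partition
G.add_edges_from(edges)

# Modularity-maximizing communities as a simple stand-in for biclustering
clusters = community.greedy_modularity_communities(G)
print(len(clusters), round(community.modularity(G, clusters), 3))
```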
", doi="10.2196/13567", url="https://medinform.jmir.org/2020/10/e13567", url="http://www.ncbi.nlm.nih.gov/pubmed/33103657" } @Article{info:doi/10.2196/19810, author="Afzal, Muhammad and Alam, Fakhare and Malik, Mahmood Khalid and Malik, M. Ghaus", title="Clinical Context--Aware Biomedical Text Summarization Using Deep Neural Network: Model Development and Validation", journal="J Med Internet Res", year="2020", month="Oct", day="23", volume="22", number="10", pages="e19810", keywords="biomedical informatics", keywords="automatic text summarization", keywords="deep neural network", keywords="word embedding", keywords="semantic similarity", keywords="brain aneurysm", abstract="Background: Automatic text summarization (ATS) enables users to retrieve meaningful evidence from big data of biomedical repositories to make complex clinical decisions. Deep neural and recurrent networks outperform traditional machine-learning techniques in areas of natural language processing and computer vision; however, they are yet to be explored in the ATS domain, particularly for medical text summarization. Objective: Traditional approaches in ATS for biomedical text suffer from fundamental issues such as an inability to capture clinical context, quality of evidence, and purpose-driven selection of passages for the summary. We aimed to circumvent these limitations through achieving precise, succinct, and coherent information extraction from credible published biomedical resources, and to construct a simplified summary containing the most informative content that can offer a review particular to clinical needs. Methods: In our proposed approach, we introduce a novel framework, termed Biomed-Summarizer, that provides quality-aware Patient/Problem, Intervention, Comparison, and Outcome (PICO)-based intelligent and context-enabled summarization of biomedical text. Biomed-Summarizer integrates the prognosis quality recognition model with a clinical context--aware model to locate text sequences in the body of a biomedical article for use in the final summary. First, we developed a deep neural network binary classifier for quality recognition to acquire scientifically sound studies and filter out others. Second, we developed a bidirectional long-short term memory recurrent neural network as a clinical context--aware classifier, which was trained on semantically enriched features generated using a word-embedding tokenizer for identification of meaningful sentences representing PICO text sequences. Third, we calculated the similarity between query and PICO text sequences using Jaccard similarity with semantic enrichments, where the semantic enrichments are obtained using medical ontologies. Last, we generated a representative summary from the high-scoring PICO sequences aggregated by study type, publication credibility, and freshness score. Results: Evaluation of the prognosis quality recognition model using a large dataset of biomedical literature related to intracranial aneurysm showed an accuracy of 95.41\% (2562/2686) in terms of recognizing quality articles. The clinical context--aware multiclass classifier outperformed the traditional machine-learning algorithms, including support vector machine, gradient boosted tree, linear regression, K-nearest neighbor, and na{\"i}ve Bayes, by achieving 93\% (16127/17341) accuracy for classifying five categories: aim, population, intervention, results, and outcome. 
The semantic similarity algorithm achieved a significant Pearson correlation coefficient of 0.61 (0-1 scale) on a well-known BIOSSES dataset (with 100 pair sentences) after semantic enrichment, representing an improvement of 8.9\% over baseline Jaccard similarity. Finally, we found a highly positive correlation among the evaluations performed by three domain experts concerning different metrics, suggesting that the automated summarization is satisfactory. Conclusions: By employing the proposed method Biomed-Summarizer, high accuracy in ATS was achieved, enabling seamless curation of research evidence from the biomedical literature to use for clinical decision-making. ", doi="10.2196/19810", url="http://www.jmir.org/2020/10/e19810/", url="http://www.ncbi.nlm.nih.gov/pubmed/33095174" } @Article{info:doi/10.2196/16779, author="Wehrens, Rik and Sihag, Vikrant and S{\"u}lz, Sandra and van Elten, Hilco and van Raaij, Erik and de Bont, Antoinette and Weggelaar-Jansen, Marie Anne", title="Understanding the Uptake of Big Data in Health Care: Protocol for a Multinational Mixed-Methods Study", journal="JMIR Res Protoc", year="2020", month="Oct", day="22", volume="9", number="10", pages="e16779", keywords="big data", keywords="performance", keywords="business modeling", keywords="regulation", keywords="implementation", keywords="innovation", keywords="social sciences", keywords="legitimacy", keywords="governmental regulation", keywords="balance score card", keywords="ethics", abstract="Background: Despite the high potential of big data, their applications in health care face many organizational, social, financial, and regulatory challenges. The societal dimensions of big data are underrepresented in much medical research. Little is known about integrating big data applications in the corporate routines of hospitals and other care providers. Equally little is understood about embedding big data applications in daily work practices and how they lead to actual improvements for health care actors, such as patients, care professionals, care providers, information technology companies, payers, and the society. Objective: This planned study aims to provide an integrated analysis of big data applications, focusing on the interrelations among concrete big data experiments, organizational routines, and relevant systemic and societal dimensions. To understand the similarities and differences between interactions in various contexts, the study covers 12 big data pilot projects in eight European countries, each with its own health care system. Workshops will be held with stakeholders to discuss the findings, our recommendations, and the implementation. Dissemination is supported by visual representations developed to share the knowledge gained. Methods: This study will utilize a mixed-methods approach that combines performance measurements, interviews, document analysis, and cocreation workshops. Analysis will be structured around the following four key dimensions: performance, embedding, legitimation, and value creation. Data and their interrelations across the dimensions will be synthesized per application and per country. Results: The study was funded in August 2017. Data collection started in April 2018 and will continue until September 2021. 
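The enriched Jaccard similarity at the core of Biomed-Summarizer's scoring (described above) can be sketched directly; the synonym map below is a toy stand-in for the ontology-based semantic enrichment the authors describe:

```python
def jaccard(a: set, b: set) -> float:
    """Plain Jaccard similarity of two token sets."""
    return len(a & b) / len(a | b) if a | b else 0.0

def enriched_jaccard(query_tokens, sentence_tokens, synonyms):
    """Jaccard after expanding both token sets with synonyms, a simplified
    stand-in for ontology-based semantic enrichment."""
    def expand(tokens):
        return set(tokens) | {s for t in tokens for s in synonyms.get(t, ())}
    return jaccard(expand(query_tokens), expand(sentence_tokens))

syn = {"aneurysm": {"ia", "intracranial aneurysm"}}  # hypothetical synonym map
print(enriched_jaccard({"rupture", "aneurysm"}, {"ia", "rupture", "risk"}, syn))
```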
The multidisciplinary focus of this study enables us to combine insights from several social sciences (health policy analysis, business administration, innovation studies, organization studies, ethics, and health services research) to advance a holistic understanding of big data value realization. The multinational character enables comparative analysis across the following eight European countries: Austria, France, Germany, Ireland, the Netherlands, Spain, Sweden, and the United Kingdom. Given that national and organizational contexts change over time, it will not be possible to isolate the factors and actors that explain the implementation of big data applications. The visual representations developed for dissemination purposes will help to reduce complexity and clarify the relations between the various dimensions. Conclusions: This study will develop an integrated approach to big data applications that considers the interrelations among concrete big data experiments, organizational routines, and relevant systemic and societal dimensions. International Registered Report Identifier (IRRID): DERR1-10.2196/16779 ", doi="10.2196/16779", url="http://www.researchprotocols.org/2020/10/e16779/", url="http://www.ncbi.nlm.nih.gov/pubmed/33090113" } @Article{info:doi/10.2196/21081, author="Tosi, Davide and Campi, Alessandro", title="How Data Analytics and Big Data Can Help Scientists in Managing COVID-19 Diffusion: Modeling Study to Predict the COVID-19 Diffusion in Italy and the Lombardy Region", journal="J Med Internet Res", year="2020", month="Oct", day="14", volume="22", number="10", pages="e21081", keywords="COVID-19", keywords="SARS-CoV-2", keywords="big data", keywords="data analytics", keywords="predictive models", keywords="prediction", keywords="modeling", keywords="Italy", keywords="diffusion", abstract="Background: COVID-19 is the most widely discussed topic worldwide in 2020, and at the beginning of the Italian epidemic, scientists tried to understand the virus diffusion and the epidemic curve of positive cases with controversial findings and numbers. Objective: In this paper, a data analytics study on the diffusion of COVID-19 in Italy and the Lombardy Region is developed to define a predictive model tailored to forecast the evolution of the diffusion over time. Methods: Starting with all available official data collected worldwide about the diffusion of COVID-19, we defined a predictive model at the beginning of March 2020 for the Italian country. Results: This paper aims at showing how this predictive model was able to forecast the behavior of the COVID-19 diffusion and how it predicted the total number of positive cases in Italy over time. The predictive model forecasted, for the Italian country, the end of the COVID-19 first wave by the beginning of June. Conclusions: This paper shows that big data and data analytics can help medical experts and epidemiologists in promptly designing accurate and generalized models to predict the different COVID-19 evolutionary phases in other countries and regions, and for second and third possible epidemic waves. ", doi="10.2196/21081", url="http://www.jmir.org/2020/10/e21081/", url="http://www.ncbi.nlm.nih.gov/pubmed/33027038" } @Article{info:doi/10.2196/17962, author="Cecchetti, A. 
Alfred and Bhardwaj, Niharika and Murughiyan, Usha and Kothakapu, Gouthami and Sundaram, Uma", title="Fueling Clinical and Translational Research in Appalachia: Informatics Platform Approach", journal="JMIR Med Inform", year="2020", month="Oct", day="14", volume="8", number="10", pages="e17962", keywords="Appalachian region", keywords="medical informatics", keywords="health care disparities", keywords="electronic health records", keywords="data warehousing", keywords="data mining", keywords="data visualization", keywords="machine learning", keywords="data science", abstract="Background: The Appalachian population is distinct, not just culturally and geographically but also in its health care needs, facing the most health care disparities in the United States. To meet these unique demands, Appalachian medical centers need an arsenal of analytics and data science tools with the foundation of a centralized data warehouse to transform health care data into actionable clinical interventions. However, this is an especially challenging task given the fragmented state of medical data within Appalachia and the need for integration of other types of data such as environmental, social, and economic with medical data. Objective: This paper aims to present the structure and process of the development of an integrated platform at a midlevel Appalachian academic medical center along with its initial uses. Methods: The Appalachian Informatics Platform was developed by the Appalachian Clinical and Translational Science Institute's Division of Clinical Informatics and consists of 4 major components: a centralized clinical data warehouse, modeling (statistical and machine learning), visualization, and model evaluation. Data from different clinical systems, billing systems, and state- or national-level data sets were integrated into a centralized data warehouse. The platform supports research efforts by enabling curation and analysis of data using the different components, as appropriate. Results: The Appalachian Informatics Platform is functional and has supported several research efforts since its implementation for a variety of purposes, such as increasing knowledge of the pathophysiology of diseases, risk identification, risk prediction, and health care resource utilization research and estimation of the economic impact of diseases. Conclusions: The platform provides an inexpensive yet seamless way to translate clinical and translational research ideas into clinical applications for regions similar to Appalachia that have limited resources and a largely rural population. ", doi="10.2196/17962", url="http://medinform.jmir.org/2020/10/e17962/", url="http://www.ncbi.nlm.nih.gov/pubmed/33052114" } @Article{info:doi/10.2196/21980, author="Wu, Jun and Wang, Jian and Nicholas, Stephen and Maitland, Elizabeth and Fan, Qiuyan", title="Application of Big Data Technology for COVID-19 Prevention and Control in China: Lessons and Recommendations", journal="J Med Internet Res", year="2020", month="Oct", day="9", volume="22", number="10", pages="e21980", keywords="big data", keywords="COVID-19", keywords="disease prevention and control", abstract="Background: In the prevention and control of infectious diseases, previous research on the application of big data technology has mainly focused on the early warning and early monitoring of infectious diseases. 
Although the application of big data technology for COVID-19 warning and monitoring remains an important task, prevention of the disease's rapid spread and reduction of its impact on society are currently the most pressing challenges for the application of big data technology during the COVID-19 pandemic. After the outbreak of COVID-19 in Wuhan, the Chinese government and nongovernmental organizations actively used big data technology to prevent, contain, and control the spread of COVID-19. Objective: The aim of this study is to discuss the application of big data technology to prevent, contain, and control COVID-19 in China; draw lessons; and make recommendations. Methods: We discuss the data collection methods and key data information that existed in China before the outbreak of COVID-19 and how these data contributed to the prevention and control of COVID-19. Next, we discuss China's new data collection methods and new information assembled after the outbreak of COVID-19. Based on the data and information collected in China, we analyzed the application of big data technology from the perspectives of data sources, data application logic, data application level, and application results. In addition, we analyzed the issues, challenges, and responses encountered by China in the application of big data technology from four perspectives: data access, data use, data sharing, and data protection. Suggestions for improvements are made for data collection, data circulation, data innovation, and data security to help understand China's response to the epidemic and to provide lessons for other countries' prevention and control of COVID-19. Results: In the process of the prevention and control of COVID-19 in China, big data technology has played an important role in personal tracking, surveillance and early warning, tracking of the virus's sources, drug screening, medical treatment, resource allocation, and production recovery. The data used included location and travel data, medical and health data, news media data, government data, online consumption data, data collected by intelligent equipment, and epidemic prevention data. We identified a number of big data problems, including low efficiency of data collection, difficulty in guaranteeing data quality, low efficiency of data use, lack of timely data sharing, and data privacy protection issues. To address these problems, we suggest unified data collection standards, innovative use of data, accelerated exchange and circulation of data, and a detailed and rigorous data protection system. Conclusions: China has used big data technology to prevent and control COVID-19 in a timely manner. To prevent and control infectious diseases, countries must collect, clean, and integrate data from a wide range of sources; use big data technology to analyze a wide range of big data; create platforms for data analyses and sharing; and address privacy issues in the collection and use of big data.
", doi="10.2196/21980", url="http://www.jmir.org/2020/10/e21980/", url="http://www.ncbi.nlm.nih.gov/pubmed/33001836" } @Article{info:doi/10.2196/20558, author="Rao, Qingmao and Zhang, Zuyue and Lv, Yalan and Zhao, Yong and Bai, Li and Hou, Xiaorong", title="Factors Associated With Influential Health-Promoting Messages on Social Media: Content Analysis of Sina Weibo", journal="JMIR Med Inform", year="2020", month="Oct", day="9", volume="8", number="10", pages="e20558", keywords="health-promoting messages", keywords="social media", keywords="Sina Weibo", keywords="influence", keywords="framing effects", keywords="health communication", abstract="Background: Social media is a powerful tool for the dissemination of health messages. However, few studies have focused on the factors that improve the influence of health messages on social media. Objective: To explore the influence of goal-framing effects, information organizing, and the use of pictures or videos in health-promoting messages, we conducted a case study of Sina Weibo, a popular social media platform in China. Methods: Literature review and expert discussion were used to determine the health themes of childhood obesity, smoking, and cancer. Web crawler technology was employed to capture data on health-promoting messages. We used the number of retweets, comments, and likes to evaluate the influence of a message. Statistical analysis was then conducted after manual coding. Specifically, binary logistic regression was used for the data analyses. Results: We crawled 20,799 Sina Weibo messages and selected 389 health-promoting messages for this study. Results indicated that the use of gain-framed messages could improve the influence of messages regarding childhood obesity (P<.001), smoking (P=.03), and cancer (P<.001). Statistical expressions could improve the influence of messages about childhood obesity (P=.02), smoking (P=.002), and cancer (P<.001). However, the use of videos significantly improved the influence of health-promoting messages only for the smoking-related messages (P=.009). Conclusions: The findings suggested that gain-framed messages and statistical expressions can be successful strategies to improve the influence of messages. Moreover, appropriate pictures and videos should be added as much as possible when generating health-promoting messages. ", doi="10.2196/20558", url="http://medinform.jmir.org/2020/10/e20558/", url="http://www.ncbi.nlm.nih.gov/pubmed/33034569" } @Article{info:doi/10.2196/18780, author="Li, Yazi and Lu, Chunji and Liu, Yang", title="Medical Insurance Information Systems in China: Mixed Methods Study", journal="JMIR Med Inform", year="2020", month="Sep", day="1", volume="8", number="9", pages="e18780", keywords="medical insurance", keywords="medical insurance information system", keywords="health information exchange", keywords="information infrastructure", keywords="big data", keywords="policy review", keywords="privacy protection", abstract="Background: Since the People's Republic of China (PRC), or China, established the basic medical insurance system (MIS) in 1998, the medical insurance information systems (MIISs) in China have effectively supported the operation of the MIS through several phases of development; the phases included a stand-alone version, the internet, and big data. In 2018, China's national medical security systems were integrated, while MIISs were facing reconstruction. 
We summarized China's experience in medical insurance informatization over the past 20 years, aiming to provide a reference for the building of a new basic MIS for China and for developing countries. Objective: This paper aims to sort out medical insurance informatization policies throughout the years, use questionnaires to determine the status quo of provincial MIIS-building in China and the relevant policies, provide references and suggestions for the top-level design and implementation of the information systems in the transitional period of China's MIS reform, and provide a reference for the building of MIISs in developing countries. Methods: We conducted policy analysis by collecting the laws, regulations, and policy documents---issued from 1998 to 2020---on China's medical insurance and its informatization; we also analyzed the US Health Insurance Portability and Accountability Act and other relevant policies. We conducted a questionnaire survey by sending out questionnaires to 31 Chinese, provincial, medical security bureaus to collect information about network links, system functions, data exchange, standards and specifications, and building modes, among other items. We conducted a literature review by searching for documents about relevant laws and policies, building methods, application results, and other documents related to MIISs; we conducted searches using PubMed, Elsevier, China National Knowledge Infrastructure, and other major literature databases. We conducted telephone interviews to verify the results of questionnaires and to understand the focus issues concerning the building of China's national MIISs during the period of integration and transition of China's MIS. Results: In 74\% (23/31) of the regions in China, MIISs were networked through dedicated fiber optic lines. In 65\% (20/31) of the regions in China, MIISs supported identity recognition based on both ID cards and social security cards. In 55\% (17/31) of the regions in China, MIISs at provincial and municipal levels were networked and have gathered basic medical insurance data, whereas MIISs were connected to health insurance companies in 35\% (11/31) of the regions in China. China's MIISs are comprised of 11 basic functional modules, among which the modules of business operation, transregional referral, reimbursement, and monitoring systems are widely applied. MIISs in 83\% (20/24) of Chinese provinces have stored data on coverage, payment, and settlement compensation of medical insurance. However, in terms of data security and privacy protection, pertinent policies are absent and data utilization is not in-depth enough. Respondents to telephone interviews universally reflected on the following issues and suggestions: in the period of integration and transition of MISs, close attention should be paid to the top-level design, and repeated investment should be avoided for the building of MIISs; MIISs should be adapted to the health care reform, and efforts should be made to strengthen the informatization support for the reform of payment methods; and MIISs should be adapted for the widespread application of mobile phones and should provide insured persons with more self-service functions. Conclusions: In the future, the building of China's basic MIISs should be deployed at the national, provincial, prefectural, and municipal levels on a unified basis. Efforts should be made to strengthen the development of standard codes, data exchange, and data utilization. 
Work should be done to formulate the rules and regulations for security and privacy protection and to balance the right to be informed with the mining and utilization of big data. Efforts should be made to intensify the interconnectivity between MISs and other health systems and to strengthen the application of medical insurance information in public health monitoring and early warning systems; this would ultimately improve the degree of trust from stakeholders, including individuals, medical service providers, and public health institutions, in the basic MIISs. ", doi="10.2196/18780", url="https://medinform.jmir.org/2020/9/e18780", url="http://www.ncbi.nlm.nih.gov/pubmed/32673209" } @Article{info:doi/10.2196/18150, author="Kardas, Przemyslaw and Aguilar-Palacio, Isabel and Almada, Marta and Cahir, Caitriona and Costa, Elisio and Giardini, Anna and Malo, Sara and Massot Mesquida, Mireia and Menditto, Enrica and Mid{\~a}o, Lu{\'i}s and Parra-Calder{\'o}n, Luis Carlos and Pepiol Salom, Enrique and Vrijens, Bernard", title="The Need to Develop Standard Measures of Patient Adherence for Big Data: Viewpoint", journal="J Med Internet Res", year="2020", month="Aug", day="27", volume="22", number="8", pages="e18150", keywords="patient adherence", keywords="big data", keywords="metrics", keywords="consensus", doi="10.2196/18150", url="http://www.jmir.org/2020/8/e18150/", url="http://www.ncbi.nlm.nih.gov/pubmed/32663138" } @Article{info:doi/10.2196/20794, author="Mackey, Ken Tim and Li, Jiawei and Purushothaman, Vidya and Nali, Matthew and Shah, Neal and Bardier, Cortni and Cai, Mingxiang and Liang, Bryan", title="Big Data, Natural Language Processing, and Deep Learning to Detect and Characterize Illicit COVID-19 Product Sales: Infoveillance Study on Twitter and Instagram", journal="JMIR Public Health Surveill", year="2020", month="Aug", day="25", volume="6", number="3", pages="e20794", keywords="COVID-19", keywords="coronavirus", keywords="infectious disease", keywords="social media", keywords="surveillance", keywords="infoveillance", keywords="infodemiology", keywords="infodemic", keywords="fraud", keywords="cybercrime", abstract="Background: The coronavirus disease (COVID-19) pandemic is perhaps the greatest global health challenge of the last century. Accompanying this pandemic is a parallel ``infodemic,'' including the online marketing and sale of unapproved, illegal, and counterfeit COVID-19 health products including testing kits, treatments, and other questionable ``cures.'' Enabling the proliferation of this content is the growing ubiquity of internet-based technologies, including popular social media platforms that now have billions of global users. Objective: This study aims to collect, analyze, identify, and enable reporting of suspected fake, counterfeit, and unapproved COVID-19--related health care products from Twitter and Instagram. Methods: This study is conducted in two phases beginning with the collection of COVID-19--related Twitter and Instagram posts using a combination of web scraping on Instagram and filtering the public streaming Twitter application programming interface for keywords associated with suspect marketing and sale of COVID-19 products. The second phase involved data analysis using natural language processing (NLP) and deep learning to identify potential sellers that were then manually annotated for characteristics of interest. We also visualized illegal selling posts on a customized data dashboard to enable public health intelligence. 
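Before the Results, a minimal sketch of the first-phase keyword filtering step that the Methods describe may help. The keyword list and posts below are invented; the study itself filtered the public Twitter streaming API and scraped Instagram at scale, then applied NLP and deep learning to the survivors.

import re

# Keywords associated with suspect marketing of COVID-19 health products (illustrative)
SUSPECT_TERMS = re.compile(r"\b(cure|immunity boost|test kit|miracle|treatment)\b",
                           re.IGNORECASE)

posts = [
    "Get your COVID immunity boost drops today, DM to order!",
    "Stay home and wash your hands.",
    "Unapproved rapid test kit, fast worldwide shipping.",
]

# Phase 1: keep only posts matching the keyword filter; phase 2 would apply
# NLP/deep learning and manual annotation to the flagged subset
flagged = [p for p in posts if SUSPECT_TERMS.search(p)]
print(f"Flagged {len(flagged)} of {len(posts)} posts for further analysis")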
Results: We collected a total of 6,029,323 tweets and 204,597 Instagram posts filtered for terms associated with suspect marketing and sale of COVID-19 health products from March to April for Twitter and February to May for Instagram. After applying our NLP and deep learning approaches, we identified 1271 tweets and 596 Instagram posts associated with questionable sales of COVID-19--related products. Generally, product introduction came in two waves, with the first consisting of questionable immunity-boosting treatments and a second involving suspect testing kits. We also detected a low volume of pharmaceuticals that have not been approved for COVID-19 treatment. Other major themes detected included products offered in different languages, various claims of product credibility, completely unsubstantiated products, unapproved testing modalities, and different payment and seller contact methods. Conclusions: Results from this study provide initial insight into one front of the ``infodemic'' fight against COVID-19 by characterizing what types of health products, selling claims, and types of sellers were active on two popular social media platforms at earlier stages of the pandemic. This cybercrime challenge is likely to continue as the pandemic progresses and more people seek access to COVID-19 testing and treatment. This data intelligence can help public health agencies, regulatory authorities, legitimate manufacturers, and technology platforms better remove and prevent this content from harming the public. ", doi="10.2196/20794", url="http://publichealth.jmir.org/2020/3/e20794/", url="http://www.ncbi.nlm.nih.gov/pubmed/32750006" } @Article{info:doi/10.2196/22214, author="Mirchev, Martin and Mircheva, Iskra and Kerekovska, Albena", title="The Academic Viewpoint on Patient Data Ownership in the Context of Big Data: Scoping Review", journal="J Med Internet Res", year="2020", month="Aug", day="18", volume="22", number="8", pages="e22214", keywords="big data", keywords="ethics", keywords="legal aspects", keywords="ownership", keywords="patient-generated health data", abstract="Background: The ownership of patient information in the context of big data is a relatively new problem, which is not yet fully recognized by the medical academic community. The problem is interdisciplinary, incorporating legal, ethical, medical, and aspects of information and communication technologies, requiring a sophisticated analysis. However, no previous scoping review has mapped existing studies on the subject. Objective: This study aims to map and assess published studies on patient data ownership in the context of big data as viewed by the academic community. Methods: A scoping review was conducted based on the 5-stage framework outlined by Arksey and O'Malley and further developed by Levac, Colquhoun, and O'Brien. The organization and reporting of results of the scoping review were conducted according to PRISMA-ScR (Preferred Reporting Items for Systematic Reviews and Meta-Analyses and its extensions for Scoping Reviews). A systematic and comprehensive search of 4 scientific information databases, PubMed, ScienceDirect, Scopus, and Springer, was performed for studies published between January 2000 and October 2019. Two authors independently assessed the eligibility of the studies and the extracted data. Results: The review included 32 eligible articles authored by academicians that correspond to 3 focus areas: problem (ownership), area (health care), and context (big data). 
Five major aspects were studied: the scientific area of the publications, the aspects of ownership, academicians' perception of ownership in the context of big data, proposed solutions, and practical applications for data ownership issues. The aspects in which publications consider ownership of medical data are not clearly distinguished but can be summarized as ethical, legal, political, and managerial. The ownership of patient data is perceived primarily as a challenge fundamental to conducting medical research, including data sales and sharing, and to a lesser degree as a means of control, a problem, a threat, and an opportunity, also in view of medical research. Although numerous solutions were proposed, falling into 3 categories (technology, law, and policy), only 3 real applications were discussed. Conclusions: The issue of ownership of patient information in the context of big data is poorly researched; it is not addressed consistently and in its entirety, and there is no consensus on policy decisions and the necessary legal regulations. Future research should investigate the issue of ownership as a core research question and not as a minor fragment among other topics. More research is needed to increase the body of knowledge regarding the development of adequate policies and relevant legal frameworks in compliance with ethical standards. The combined efforts of multidisciplinary academic teams are needed to overcome existing gaps in the perception of ownership, the aspects of ownership, and the possible solutions to patient data ownership issues in the reality of big data. ", doi="10.2196/22214", url="http://www.jmir.org/2020/8/e22214/", url="http://www.ncbi.nlm.nih.gov/pubmed/32808934" } @Article{info:doi/10.2196/18044, author="Cahan, M. Eli and Khatri, Purvesh", title="Data Heterogeneity: The Enzyme to Catalyze Translational Bioinformatics?", journal="J Med Internet Res", year="2020", month="Aug", day="12", volume="22", number="8", pages="e18044", keywords="medical Informatics", keywords="health equity", keywords="health care disparities", keywords="population health", keywords="quality improvement", keywords="precision medicine", doi="10.2196/18044", url="https://www.jmir.org/2020/8/e18044", url="http://www.ncbi.nlm.nih.gov/pubmed/32784182" } @Article{info:doi/10.2196/17211, author="Iqbal, Usman and Celi, Anthony Leo and Li, Jack Yu-Chuan", title="How Can Artificial Intelligence Make Medicine More Preemptive?", journal="J Med Internet Res", year="2020", month="Aug", day="11", volume="22", number="8", pages="e17211", keywords="artificial intelligence", keywords="digital health", keywords="eHealth", keywords="health care technology", keywords="medical innovations", keywords="health information technology", keywords="advanced care systems", doi="10.2196/17211", url="https://www.jmir.org/2020/8/e17211", url="http://www.ncbi.nlm.nih.gov/pubmed/32780024" } @Article{info:doi/10.2196/17508, author="Ismail, Leila and Materwala, Huned and Karduck, P.
Achim and Adem, Abdu", title="Requirements of Health Data Management Systems for Biomedical Care and Research: Scoping Review", journal="J Med Internet Res", year="2020", month="Jul", day="7", volume="22", number="7", pages="e17508", keywords="big data", keywords="blockchain", keywords="data analytics", keywords="eHealth", keywords="electronic medical records", keywords="health care", keywords="health information management", keywords="Internet of Things", keywords="medical research", keywords="mHealth", abstract="Background: Over the last century, disruptive incidents in the fields of clinical and biomedical research have yielded a tremendous change in health data management systems. This is due to a number of breakthroughs in the medical field and the need for big data analytics and the Internet of Things (IoT) to be incorporated in a real-time smart health information management system. In addition, the requirements of patient care have evolved over time, allowing for more accurate prognoses and diagnoses. In this paper, we discuss the temporal evolution of health data management systems and capture the requirements that led to the development of a given system over a certain period of time. Consequently, we provide insights into those systems and give suggestions and research directions on how they can be improved for a better health care system. Objective: This study aimed to show that there is a need for a secure and efficient health data management system that will allow physicians and patients to update decentralized medical records and to analyze the medical data for supporting more precise diagnoses, prognoses, and public insights. Limitations of existing health data management systems were analyzed. Methods: To study the evolution and requirements of health data management systems over the years, a search was conducted to obtain research articles and information on medical lawsuits, health regulations, and acts. These materials were obtained from the Institute of Electrical and Electronics Engineers, the Association for Computing Machinery, Elsevier, MEDLINE, PubMed, Scopus, and Web of Science databases. Results: Health data management systems have undergone a disruptive transformation over the years from paper to computer, web, cloud, IoT, big data analytics, and finally to blockchain. The requirements of a health data management system revealed from the evolving definitions of medical records and their management are (1) medical record data, (2) real-time data access, (3) patient participation, (4) data sharing, (5) data security, (6) patient identity privacy, and (7) public insights. This paper reviewed health data management systems based on these 7 requirements across studies conducted over the years. To our knowledge, this is the first analysis of the temporal evolution of health data management systems giving insights into the system requirements for better health care. Conclusions: There is a need for a comprehensive real-time health data management system that allows physicians, patients, and external users to input their medical and lifestyle data into the system. The incorporation of big data analytics will aid in better prognosis or diagnosis of the diseases and the prediction of diseases. The prediction results will help in the development of an effective prevention plan. 
", doi="10.2196/17508", url="https://www.jmir.org/2020/7/e17508", url="http://www.ncbi.nlm.nih.gov/pubmed/32348265" } @Article{info:doi/10.2196/17451, author="Lee, WJ Edmund and Bekalu, Awoke Mesfin and McCloud, Rachel and Vallone, Donna and Arya, Monisha and Osgood, Nathaniel and Li, Xiaoyan and Minsky, Sara and Viswanath, Kasisomayajula", title="The Potential of Smartphone Apps in Informing Protobacco and Antitobacco Messaging Efforts Among Underserved Communities: Longitudinal Observational Study", journal="J Med Internet Res", year="2020", month="Jul", day="7", volume="22", number="7", pages="e17451", keywords="mobile health", keywords="mobile phone", keywords="tobacco use", keywords="big data", keywords="spatial analysis", keywords="data science", abstract="Background: People from underserved communities such as those from lower socioeconomic positions or racial and ethnic minority groups are often disproportionately targeted by the tobacco industry, through the relatively high levels of tobacco retail outlets (TROs) located in their neighborhood or protobacco marketing and promotional strategies. It is difficult to capture the smoking behaviors of individuals in actual locations as well as the extent of exposure to tobacco promotional efforts. With the high ownership of smartphones in the United States---when used alongside data sources on TRO locations---apps could potentially improve tobacco control efforts. Health apps could be used to assess individual-level exposure to tobacco marketing, particularly in relation to the locations of TROs as well as locations where they were most likely to smoke. To date, it remains unclear how health apps could be used practically by health promotion organizations to better reach underserved communities in their tobacco control efforts. Objective: This study aimed to demonstrate how smartphone apps could augment existing data on locations of TROs within underserved communities in Massachusetts and Texas to help inform tobacco control efforts. Methods: Data for this study were collected from 2 sources: (1) geolocations of TROs from the North American Industry Classification System 2016 and (2) 95 participants (aged 18 to 34 years) from underserved communities who resided in Massachusetts and Texas and took part in an 8-week study using location tracking on their smartphones. We analyzed the data using spatial autocorrelation, optimized hot spot analysis, and fitted power-law distribution to identify the TROs that attracted the most human traffic using mobility data. Results: Participants reported encountering protobacco messages mostly from store signs and displays and antitobacco messages predominantly through television. In Massachusetts, clusters of TROs (Dorchester Center and Jamaica Plain) and reported smoking behaviors (Dorchester Center, Roxbury Crossing, Lawrence) were found in economically disadvantaged neighborhoods. Despite the widespread distribution of TROs throughout the communities, participants overwhelmingly visited a relatively small number of TROs in Roxbury and Methuen. In Texas, clusters of TROs (Spring, Jersey Village, Bunker Hill Village, Sugar Land, and Missouri City) were found primarily in Houston, whereas clusters of reported smoking behaviors were concentrated in West University Place, Aldine, Jersey Village, Spring, and Baytown. 
Conclusions: Smartphone apps could be used to pair geolocation data with self-reported smoking behavior in order to gain a better understanding of how tobacco product marketing and promotion influence smoking behavior within vulnerable communities. Public health officials could take advantage of smartphone data collection capabilities to implement targeted tobacco control efforts in these strategic locations to reach underserved communities in their built environment. ", doi="10.2196/17451", url="https://www.jmir.org/2020/7/e17451", url="http://www.ncbi.nlm.nih.gov/pubmed/32673252" } @Article{info:doi/10.2196/17257, author="Du, Zhenzhen and Yang, Yujie and Zheng, Jing and Li, Qi and Lin, Denan and Li, Ye and Fan, Jianping and Cheng, Wen and Chen, Xie-Hui and Cai, Yunpeng", title="Accurate Prediction of Coronary Heart Disease for Patients With Hypertension From Electronic Health Records With Big Data and Machine-Learning Methods: Model Development and Performance Evaluation", journal="JMIR Med Inform", year="2020", month="Jul", day="6", volume="8", number="7", pages="e17257", keywords="coronary heart disease", keywords="machine learning", keywords="electronic health records", keywords="predictive algorithms", keywords="hypertension", abstract="Background: Predictions of cardiovascular disease risks based on health records have long attracted broad research interests. Despite extensive efforts, the prediction accuracy has remained unsatisfactory. This raises the question as to whether the data insufficiency, statistical and machine-learning methods, or intrinsic noise have hindered the performance of previous approaches, and how these issues can be alleviated. Objective: Based on a large population of patients with hypertension in Shenzhen, China, we aimed to establish a high-precision coronary heart disease (CHD) prediction model through big data and machine-learning methods. Methods: Data from a large cohort of 42,676 patients with hypertension, including 20,156 patients with CHD onset, were investigated from electronic health records (EHRs) 1-3 years prior to CHD onset (for CHD-positive cases) or during a disease-free follow-up period of more than 3 years (for CHD-negative cases). The population was divided evenly into independent training and test datasets. Various machine-learning methods were adopted on the training set to achieve high-accuracy prediction models and the results were compared with traditional statistical methods and well-known risk scales. Comparison analyses were performed to investigate the effects of training sample size, factor sets, and modeling approaches on the prediction performance. Results: An ensemble method, XGBoost, achieved high accuracy in predicting 3-year CHD onset for the independent test dataset with an area under the receiver operating characteristic curve (AUC) value of 0.943. Comparison analysis showed that nonlinear models (K-nearest neighbor AUC 0.908, random forest AUC 0.938) outperform linear models (logistic regression AUC 0.865) on the same datasets, and machine-learning methods significantly surpassed traditional risk scales or fixed models (eg, Framingham cardiovascular disease risk models). Further analyses revealed that using time-dependent features obtained from multiple records, including both statistical variables and changing-trend variables, helped to improve the performance compared to using only static features.
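A minimal sketch of the kind of pipeline this abstract describes: gradient boosting (XGBoost) on static plus changing-trend EHR features, evaluated by AUC on an evenly split held-out test set. The data below are randomly generated stand-ins and the feature names are hypothetical; this is not the study's code.

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

rng = np.random.default_rng(0)
n = 2000
# Hypothetical features: static measurements plus a changing-trend variable
X = np.column_stack([
    rng.normal(140, 15, n),   # latest systolic blood pressure
    rng.normal(0.0, 1.0, n),  # trend (slope) of blood pressure across visits
    rng.normal(5.5, 1.0, n),  # latest fasting glucose
])
# Simulated 3-year CHD onset label loosely tied to the features
logits = 0.03 * (X[:, 0] - 140) + 0.8 * X[:, 1] + 0.3 * (X[:, 2] - 5.5)
y = (rng.random(n) < 1 / (1 + np.exp(-logits))).astype(int)

# Even split into independent training and test sets, as in the abstract
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.5, random_state=0)
model = XGBClassifier(n_estimators=200, max_depth=3, eval_metric="logloss")
model.fit(X_tr, y_tr)
print(f"Test AUC: {roc_auc_score(y_te, model.predict_proba(X_te)[:, 1]):.3f}")

A real EHR pipeline would use many more features and records, but the evaluation skeleton (independent split, probability outputs, AUC) stays the same.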
Subpopulation analysis showed that the impact of feature design had a more significant effect on model accuracy than the population size. Marginal effect analysis showed that both traditional and EHR factors exhibited highly nonlinear characteristics with respect to the risk scores. Conclusions: We demonstrated that accurate risk prediction of CHD from EHRs is possible given a sufficiently large population of training data. Sophisticated machine-learning methods played an important role in tackling the heterogeneity and nonlinear nature of disease prediction. Moreover, accumulated EHR data over multiple time points provided additional features that were valuable for risk prediction. Our study highlights the importance of accumulating big data from EHRs for accurate disease predictions. ", doi="10.2196/17257", url="https://medinform.jmir.org/2020/7/e17257", url="http://www.ncbi.nlm.nih.gov/pubmed/32628616" } @Article{info:doi/10.2196/19773, author="de Lusignan, Simon and Jones, Nicholas and Dorward, Jienchi and Byford, Rachel and Liyanage, Harshana and Briggs, John and Ferreira, Filipa and Akinyemi, Oluwafunmi and Amirthalingam, Gayatri and Bates, Chris and Lopez Bernal, Jamie and Dabrera, Gavin and Eavis, Alex and Elliot, J. Alex and Feher, Michael and Krajenbrink, Else and Hoang, Uy and Howsam, Gary and Leach, Jonathan and Okusi, Cecilia and Nicholson, Brian and Nieri, Philip and Sherlock, Julian and Smith, Gillian and Thomas, Mark and Thomas, Nicholas and Tripathy, Manasa and Victor, William and Williams, John and Wood, Ian and Zambon, Maria and Parry, John and O'Hanlon, Shaun and Joy, Mark and Butler, Chris and Marshall, Martin and Hobbs, Richard F. D.", title="The Oxford Royal College of General Practitioners Clinical Informatics Digital Hub: Protocol to Develop Extended COVID-19 Surveillance and Trial Platforms", journal="JMIR Public Health Surveill", year="2020", month="Jul", day="2", volume="6", number="3", pages="e19773", keywords="primary health care", keywords="general practice", keywords="medical record systems, computerized", keywords="sentinel surveillance", keywords="public health surveillance", keywords="clinical trials as a topic", keywords="adaptive clinical trials", keywords="severe acute respiratory syndrome coronavirus 2", keywords="COVID-19", abstract="Background: Routinely recorded primary care data have been used for many years by sentinel networks for surveillance. More recently, real world data have been used for a wider range of research projects to support rapid, inexpensive clinical trials. Because the partial national lockdown in the United Kingdom due to the coronavirus disease (COVID-19) pandemic has resulted in decreasing community disease incidence, much larger numbers of general practices are needed to deliver effective COVID-19 surveillance and contribute to in-pandemic clinical trials. Objective: The aim of this protocol is to describe the rapid design and development of the Oxford Royal College of General Practitioners Clinical Informatics Digital Hub (ORCHID) and its first two platforms. The Surveillance Platform will provide extended primary care surveillance, while the Trials Platform is a streamlined clinical trials platform that will be integrated into routine primary care practice. 
Methods: We will apply the FAIR (Findable, Accessible, Interoperable, and Reusable) metadata principles to a new, integrated digital health hub that will extract routinely collected general practice electronic health data for use in clinical trials and provide enhanced communicable disease surveillance. The hub will be findable through membership in Health Data Research UK and European metadata repositories. Accessibility through an online application system will provide access to study-ready data sets or developed custom data sets. Interoperability will be facilitated by fixed linkage to other key sources such as Hospital Episodes Statistics and the Office of National Statistics using pseudonymized data. All semantic descriptors (ie, ontologies) and code used for analysis will be made available to accelerate analyses. We will also make data available using common data models, starting with the US Food and Drug Administration Sentinel and Observational Medical Outcomes Partnership approaches, to facilitate international studies. The Surveillance Platform will provide access to data for health protection and promotion work as authorized through agreements between Oxford, the Royal College of General Practitioners, and Public Health England. All studies using the Trials Platform will go through appropriate ethical and other regulatory approval processes. Results: The hub will be a bottom-up, professionally led network that will provide benefits for member practices, our health service, and the population served. Data will only be used for SQUIRE (surveillance, quality improvement, research, and education) purposes. We have already received positive responses from practices, and the number of practices in the network has doubled to over 1150 since February 2020. COVID-19 surveillance has resulted in tripling of the number of virology sites to 293 (target 300), which has aided the collection of the largest ever weekly total of surveillance swabs in the United Kingdom as well as over 3000 severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) serology samples. Practices are recruiting to the PRINCIPLE (Platform Randomised trial of INterventions against COVID-19 In older PeopLE) trial, and these participants will be followed up through ORCHID. These initial outputs demonstrate the feasibility of ORCHID to provide an extended national digital health hub. Conclusions: ORCHID will provide equitable and innovative use of big data through a professionally led national primary care network and the application of FAIR principles. The secure data hub will host routinely collected general practice data linked to other key health care repositories for clinical trials and support enhanced in situ surveillance without always requiring large volume data extracts. ORCHID will support rapid data extraction, analysis, and dissemination with the aim of improving future research and development in general practice to positively impact patient care. International Registered Report Identifier (IRRID): DERR1-10.2196/19773 ", doi="10.2196/19773", url="https://publichealth.jmir.org/2020/3/e19773", url="http://www.ncbi.nlm.nih.gov/pubmed/32484782" } @Article{info:doi/10.2196/16922, author="Musacchio, Nicoletta and Giancaterini, Annalisa and Guaita, Giacomo and Ozzello, Alessandro and Pellegrini, A. Maria and Ponzani, Paola and Russo, T. 
Giuseppina and Zilich, Rita and de Micheli, Alberto", title="Artificial Intelligence and Big Data in Diabetes Care: A Position Statement of the Italian Association of Medical Diabetologists", journal="J Med Internet Res", year="2020", month="Jun", day="22", volume="22", number="6", pages="e16922", keywords="artificial intelligence", keywords="big data analytics", keywords="clinical decision making", keywords="diabetes management", keywords="health care", doi="10.2196/16922", url="http://www.jmir.org/2020/6/e16922/", url="http://www.ncbi.nlm.nih.gov/pubmed/32568088" } @Article{info:doi/10.2196/19787, author="P{\'e}pin, Louis Jean and Bruno, Maria Rosa and Yang, Rui-Yi and Vercamer, Vincent and Jouhaud, Paul and Escourrou, Pierre and Boutouyrie, Pierre", title="Wearable Activity Trackers for Monitoring Adherence to Home Confinement During the COVID-19 Pandemic Worldwide: Data Aggregation and Analysis", journal="J Med Internet Res", year="2020", month="Jun", day="19", volume="22", number="6", pages="e19787", keywords="wearable activity trackers", keywords="pandemic", keywords="COVID-19", keywords="home confinement", keywords="lockdown", keywords="monitoring", keywords="wearables", keywords="tracking", abstract="Background: In the context of home confinement during the coronavirus disease (COVID-19) pandemic, objective, real-time data are needed to assess populations' adherence to home confinement to adapt policies and control measures accordingly. Objective: The aim of this study was to determine whether wearable activity trackers could provide information regarding users' adherence to home confinement policies because of their capacity for seamless and continuous monitoring of individuals' natural activity patterns regardless of their location. Methods: We analyzed big data from individuals using activity trackers (Withings) that count the wearer's average daily number of steps in a number of representative nations that adopted different modalities of restriction of citizens' activities. Results: Data on the number of steps per day from over 740,000 individuals around the world were analyzed. We demonstrate the physical activity patterns in several representative countries with total, partial, or no home confinement. The decrease in steps per day in regions with strict total home confinement ranged from 25\% to 54\%. Partial lockdown (characterized by social distancing measures such as school closures, bar and restaurant closures, and cancellation of public meetings but without strict home confinement) does not appear to have a significant impact on people's activity compared to the pre-pandemic period. The absolute level of physical activity under total home confinement in European countries is around twofold that in China. In some countries, such as France and Spain, physical activity started to gradually decrease even before official commitment to lockdown as a result of initial less stringent restriction orders or self-quarantine. However, physical activity began to increase again in the last 2 weeks, suggesting a decrease in compliance with confinement orders. Conclusions: Aggregate analysis of activity tracker data with the potential for daily updates can provide information regarding adherence to home confinement policies.
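A minimal sketch of the core aggregation step described above: the percent change in average daily steps under confinement relative to a pre-pandemic baseline. The per-country step counts below are invented placeholders, not the study's tracker data.

# Hypothetical pre-pandemic baselines and confinement-period averages (steps/day)
baseline_steps = {"France": 7800, "Spain": 7500, "Italy": 7200}
lockdown_steps = {"France": 4600, "Spain": 3500, "Italy": 4100}

for country, before in baseline_steps.items():
    after = lockdown_steps[country]
    drop = 100 * (before - after) / before
    print(f"{country}: {drop:.0f}% fewer steps per day under confinement")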
", doi="10.2196/19787", url="http://www.jmir.org/2020/6/e19787/", url="http://www.ncbi.nlm.nih.gov/pubmed/32501803" } @Article{info:doi/10.2196/16213, author="Peng, Li-Ning and Hsiao, Fei-Yuan and Lee, Wei-Ju and Huang, Shih-Tsung and Chen, Liang-Kung", title="Comparisons Between Hypothesis- and Data-Driven Approaches for Multimorbidity Frailty Index: A Machine Learning Approach", journal="J Med Internet Res", year="2020", month="Jun", day="11", volume="22", number="6", pages="e16213", keywords="multimorbidity frailty index", keywords="machine learning", keywords="random forest", keywords="unplanned hospitalizations", keywords="intensive care unit admissions", keywords="mortality", abstract="Background: Using big data and the theory of cumulative deficits to develop the multimorbidity frailty index (mFI) has become a widely accepted approach in public health and health care services. However, constructing the mFI using the most critical determinants and stratifying different risk groups with dose-response relationships remain major challenges in clinical practice. Objective: This study aimed to develop the mFI by using machine learning methods that select variables based on the optimal fitness of the model. In addition, we aimed to further establish 4 entities of risk using a machine learning approach that would achieve the best distinction between groups and demonstrate the dose-response relationship. Methods: In this study, we used Taiwan's National Health Insurance Research Database to develop a machine learning multimorbidity frailty index (ML-mFI) using the theory of cumulative diseases/deficits of an individual older person. Compared to the conventional mFI, in which the selection of diseases/deficits is based on expert opinion, we adopted the random forest method to select the most influential diseases/deficits that predict adverse outcomes for older people. To ensure that the survival curves showed a dose-response relationship with overlap during the follow-up, we developed the distance index and coverage index, which can be used at any time point to classify the ML-mFI of all subjects into the categories of fit, mild frailty, moderate frailty, and severe frailty. Survival analysis was conducted to evaluate the ability of the ML-mFI to predict adverse outcomes, such as unplanned hospitalizations, intensive care unit (ICU) admissions, and mortality. Results: The final ML-mFI model contained 38 diseases/deficits. Compared with conventional mFI, both indices had similar distribution patterns by age and sex; however, among people aged 65 to 69 years, the mean mFI and ML-mFI were 0.037 (SD 0.048) and 0.0070 (SD 0.0254), respectively. The difference may result from discrepancies in the diseases/deficits selected in the mFI and the ML-mFI. A total of 86,133 subjects aged 65 to 100 years were included in this study and were categorized into 4 groups according to the ML-mFI. Both the Kaplan-Meier survival curves and Cox models showed that the ML-mFI significantly predicted all outcomes of interest, including all-cause mortality, unplanned hospitalizations, and all-cause ICU admissions at 1, 5, and 8 years of follow-up (P<.01). In particular, a dose-response relationship was revealed between the 4 ML-mFI groups and adverse outcomes. 
Conclusions: The ML-mFI consists of 38 diseases/deficits that can successfully stratify risk groups associated with all-cause mortality, unplanned hospitalizations, and all-cause ICU admissions in older people, which indicates that precise, patient-centered medical care can be a reality in an aging society. ", doi="10.2196/16213", url="http://www.jmir.org/2020/6/e16213/", url="http://www.ncbi.nlm.nih.gov/pubmed/32525481" } @Article{info:doi/10.2196/16879, author="Schneble, Olivier Christophe and Elger, Simone Bernice and Shaw, Martin David", title="All Our Data Will Be Health Data One Day: The Need for Universal Data Protection and Comprehensive Consent", journal="J Med Internet Res", year="2020", month="May", day="28", volume="22", number="5", pages="e16879", keywords="big data", keywords="health data", keywords="social media", keywords="data protection", keywords="guidelines", keywords="best practices", doi="10.2196/16879", url="http://www.jmir.org/2020/5/e16879/", url="http://www.ncbi.nlm.nih.gov/pubmed/32463372" } @Article{info:doi/10.2196/18323, author="Du, Jian and Li, Xiaoying", title="A Knowledge Graph of Combined Drug Therapies Using Semantic Predications From Biomedical Literature: Algorithm Development", journal="JMIR Med Inform", year="2020", month="Apr", day="28", volume="8", number="4", pages="e18323", keywords="combined drug therapy", keywords="knowledge graph", keywords="knowledge discovery", keywords="semantic predications", abstract="Background: Combination therapy plays an important role in the effective treatment of malignant neoplasms and precision medicine. Numerous clinical studies have been carried out to investigate combination drug therapies. Automated knowledge discovery of these combinations and their graphic representation in knowledge graphs will enable pattern recognition and identification of drug combinations used to treat a specific type of cancer, improve drug efficacy and treatment of human disorders. Objective: This paper aims to develop an automated, visual approach to discover knowledge about combination therapies from biomedical literature, especially from those studies with high-level evidence such as clinical trial reports and clinical practice guidelines. Methods: Based on semantic predications, which consist of a triple structure of subject-predicate-object (SPO), we proposed an automated algorithm to discover knowledge of combination drug therapies using the following rules: 1) two or more semantic predications (S1-P-O and Si-P-O, i = 2, 3\ldots) can be extracted from one conclusive claim (sentence) in the abstract of a given publication, and 2) these predications have an identical predicate (that closely relates to human disease treatment, eg, ``treat'') and object (eg, disease name) but different subjects (eg, drug names). A customized knowledge graph organizes and visualizes these combinations, improving the traditional semantic triples. After automatic filtering of broad concepts such as ``pharmacologic actions'' and generic disease names, a set of combination drug therapies were identified and characterized through manual interpretation. Results: We retrieved 22,263 clinical trial reports and 31 clinical practice guidelines from PubMed abstracts by searching ``antineoplastic agents'' for drug restriction (published between Jan 2009 and Oct 2019). 
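A minimal sketch of the grouping rule described in the Methods above: within one conclusive claim, collect SemRep-style subject-predicate-object triples that share predicate and object but differ in subject. The predications below are invented, and SemRep itself is not called here.

from collections import defaultdict

# Hypothetical (subject, predicate, object) predications from one conclusive claim
predications = [
    ("gemcitabine", "TREATS", "pancreatic cancer"),
    ("nab-paclitaxel", "TREATS", "pancreatic cancer"),
    ("aspirin", "TREATS", "headache"),
]

# Group triples sharing predicate and object but with different subjects
groups = defaultdict(list)
for subj, pred, obj in predications:
    groups[(pred, obj)].append(subj)

for (pred, obj), subjects in groups.items():
    if len(subjects) >= 2:  # two or more subjects -> candidate combination
        print(f"Candidate combination: {' + '.join(subjects)} {pred} {obj}")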
There were 15,603 conclusive claims locally parsed using the search terms ``conclusion*'' and ``conclude*'' ready for semantic predications extraction by SemRep, and 325 candidate groups of semantic predications about combined medications were automatically discovered within 316 conclusive claims. Based on manual analysis, we determined that 255/316 claims (78.46\%) were accurately identified as describing combination therapies and adopted these to construct the customized knowledge graph. We also identified two categories (and 4 subcategories) to characterize the inaccurate results: limitations of SemRep and limitations of proposal. We further learned the predominant patterns of drug combinations based on mechanism of action for new combined medication studies and discovered 4 obvious markers (``combin*,'' ``coadministration,'' ``co-administered,'' and ``regimen'') to identify potential combination therapies to enable development of a machine learning algorithm. Conclusions: Semantic predications from conclusive claims in the biomedical literature can be used to support automated knowledge discovery and knowledge graph construction for combination therapies. A machine learning approach is warranted to take full advantage of the identified markers and other contextual features. ", doi="10.2196/18323", url="http://medinform.jmir.org/2020/4/e18323/", url="http://www.ncbi.nlm.nih.gov/pubmed/32343247" } @Article{info:doi/10.2196/16102, author="Grundstrom, Casandra and Korhonen, Olli and V{\"a}yrynen, Karin and Isomursu, Minna", title="Insurance Customers' Expectations for Sharing Health Data: Qualitative Survey Study", journal="JMIR Med Inform", year="2020", month="Mar", day="26", volume="8", number="3", pages="e16102", keywords="data sharing", keywords="qualitative research", keywords="survey", keywords="health insurance", keywords="insurance", keywords="medical informatics", keywords="health services", abstract="Background: Insurance organizations are essential stakeholders in health care ecosystems. For addressing future health care needs, insurance companies require access to health data to deliver preventative and proactive digital health services to customers. However, extant research is limited in examining the conditions that incentivize health data sharing. Objective: This study aimed to (1) identify the expectations of insurance customers when sharing health data, (2) determine the perceived intrinsic value of health data, and (3) explore the conditions that aid in incentivizing health data sharing in the relationship between an insurance organization and its customer. Methods: A Web-based survey was distributed to randomly selected customers from a Finnish insurance organization through email. A single open-text answer was used for a qualitative data analysis through inductive coding, followed by a thematic analysis. Furthermore, the 4 constructs of commitment, power, reciprocity, and trust from the social exchange theory (SET) were applied as a framework. Results: From the 5000 customers invited to participate, we received 452 surveys (response rate: 9.0\%). Customer characteristics were found to reflect customer demographics. Of the 452 surveys, 48 (10.6\%) open-text responses were skipped by the customer, 57 (12.6\%) customers had no expectations from sharing health data, and 44 (9.7\%) customers preferred to abstain from a data sharing relationship. 
Using the SET framework, we found that customers expected different conditions to be fulfilled by their insurance provider based on the commitment, power, reciprocity, and trust constructs. Of the 452 customers who completed the surveys, 64 (14.2\%) customers required that the insurance organization meets their data treatment expectations (commitment). Overall, 4.9\% (22/452) of customers were concerned about their health data being used against them to profile their health, to increase insurance prices, or to deny health insurance claims (power). A total of 28.5\% (129/452) of customers expected some form of benefit, such as personalized digital health services, and 29.9\% (135/452) of customers expected finance-related compensation (reciprocity). Furthermore, 7.5\% (34/452) of customers expected some form of empathy from the insurance organization through enhanced transparency or an emotional connection (trust). Conclusions: To aid in the design and development of digital health services, insurance organizations need to address the customers' expectations when sharing their health data. We established the expectations of customers in the social exchange of health data and explored the perceived values of data as intangible goods. Actions by the insurance organization should aim to increase trust through a culture of transparency, commitment to treat health data in a prescribed manner, provide reciprocal benefits through digital health services that customers deem valuable, and assuage fears of health data being used to prevent providing insurance coverage or increase costs. ", doi="10.2196/16102", url="http://medinform.jmir.org/2020/3/e16102/", url="http://www.ncbi.nlm.nih.gov/pubmed/32213467" } @Article{info:doi/10.2196/16492, author="Reiner Benaim, Anat and Almog, Ronit and Gorelik, Yuri and Hochberg, Irit and Nassar, Laila and Mashiach, Tanya and Khamaisi, Mogher and Lurie, Yael and Azzam, S. Zaher and Khoury, Johad and Kurnik, Daniel and Beyar, Rafael", title="Analyzing Medical Research Results Based on Synthetic Data and Their Relation to Real Data Results: Systematic Comparison From Five Observational Studies", journal="JMIR Med Inform", year="2020", month="Feb", day="20", volume="8", number="2", pages="e16492", keywords="synthetic data", keywords="electronic medical records", keywords="MDClone", keywords="validation study", keywords="big data analysis", abstract="Background: Privacy restrictions limit access to protected patient-derived health information for research purposes. Consequently, data anonymization is required to allow researchers data access for initial analysis before granting institutional review board approval. A system installed and activated at our institution enables synthetic data generation that mimics data from real electronic medical records, wherein only fictitious patients are listed. Objective: This paper aimed to validate the results obtained when analyzing synthetic structured data for medical research. A comprehensive validation process concerning meaningful clinical questions and various types of data was conducted to assess the accuracy and precision of statistical estimates derived from synthetic patient data. Methods: A cross-hospital project was conducted to validate results obtained from synthetic data produced for five contemporary studies on various topics. For each study, results derived from synthetic data were compared with those based on real data. 
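A minimal sketch of the validation logic just described: compute the same statistical estimate on the real cohort and on repeatedly generated synthetic cohorts, then inspect bias and stability. The "generator" below is a crude resampling stand-in, not MDClone, and the measurements are invented.

import numpy as np

rng = np.random.default_rng(2)
real = rng.normal(120, 15, 5000)  # eg, systolic BP in the real cohort
real_mean = real.mean()

# Repeatedly generate synthetic cohorts and recompute the same estimate
synthetic_means = []
for _ in range(20):
    synthetic = rng.choice(real, size=real.size) + rng.normal(0, 2, real.size)
    synthetic_means.append(synthetic.mean())

bias = np.mean(synthetic_means) - real_mean
stability = np.std(synthetic_means)
print(f"Real mean {real_mean:.1f}; synthetic bias {bias:+.2f}; "
      f"SD across replicates {stability:.2f}")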
In addition, repeatedly generated synthetic datasets were used to estimate the bias and stability of results obtained from synthetic data. Results: This study demonstrated that results derived from synthetic data were predictive of results from real data. When the number of patients was large relative to the number of variables used, highly accurate and strongly consistent results were observed between synthetic and real data. For studies based on smaller populations that accounted for confounders and modifiers by multivariate models, predictions were of moderate accuracy, yet clear trends were correctly observed. Conclusions: The use of synthetic structured data provides a close estimate of real data results and is thus a powerful tool in shaping research hypotheses and accessing estimated analyses, without risking patient privacy. Synthetic data enable broad access to data (eg, for out-of-organization researchers), and rapid, safe, and repeatable analysis of data in hospitals or other health organizations where patient privacy is a primary value. ", doi="10.2196/16492", url="http://medinform.jmir.org/2020/2/e16492/", url="http://www.ncbi.nlm.nih.gov/pubmed/32130148" } @Article{info:doi/10.2196/16765, author="Jiang, Jinglu and Cameron, Ann-Frances and Yang, Ming", title="Analysis of Massive Online Medical Consultation Service Data to Understand Physicians' Economic Return: Observational Data Mining Study", journal="JMIR Med Inform", year="2020", month="Feb", day="18", volume="8", number="2", pages="e16765", keywords="Web-based health services", keywords="remote consultation", keywords="machine learning", keywords="data mining", keywords="decision tree", keywords="patient involvement", abstract="Background: Online health care consultation has become increasingly popular and is considered a potential solution to health care resource shortages and inefficient resource distribution. However, many online medical consultation platforms are struggling to attract and retain patients who are willing to pay, and health care providers on the platform have the additional challenge of standing out in a crowd of physicians who can provide comparable services. Objective: This study used machine learning (ML) approaches to mine massive service data to (1) identify the important features that are associated with patient payment, as opposed to free trial--only appointments; (2) explore the relative importance of these features; and (3) understand how these features interact, linearly or nonlinearly, in relation to payment. Methods: The dataset is from the largest China-based online medical consultation platform, which covers 1,582,564 consultation records between patient-physician pairs from 2009 to 2018. ML techniques (ie, hyperparameter tuning, model training, and validation) were applied with four classifiers---logistic regression, decision tree (DT), random forest, and gradient boost---to identify the most important features and their relative importance for predicting paid vs free-only appointments. Results: After applying the ML feature selection procedures, we identified 11 key features on the platform, which are potentially useful to predict payment. For the binary ML classification task (paid vs free services), the 11 features as a whole achieved very good prediction performance across all four classifiers.
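A minimal sketch of the four-classifier comparison just reported, assuming scikit-learn and a synthetic stand-in for the 11 platform features; the study's hyperparameter tuning grids and real consultation records are not reproduced:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Synthetic stand-in for the 11 selected features and the paid-vs-free label.
X, y = make_classification(n_samples=2000, n_features=11, random_state=42)

classifiers = {
    "logistic regression": LogisticRegression(max_iter=1000),
    "decision tree": DecisionTreeClassifier(max_depth=5),
    "random forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "gradient boost": GradientBoostingClassifier(random_state=42),
}
for name, clf in classifiers.items():
    auc = cross_val_score(clf, X, y, cv=5, scoring="roc_auc").mean()
    print(f"{name:>20}: cross-validated AUC = {auc:.3f}")
```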
DT analysis further identified five distinct subgroups of patients delineated by five top-ranked features: previous offline connection, total dialog, physician response rate, patient privacy concern, and social return. These subgroups interact with the physician differently, resulting in different payment outcomes. Conclusions: The results show that, compared with features related to physician reputation, service-related features, such as service delivery quality (eg, consultation dialog intensity and physician response rate), patient source (eg, online vs offline returning patients), and patient involvement (eg, providing social returns and revealing previous treatment), appear to contribute more to the patient's payment decision. Promoting multiple timely responses in patient-provider interactions is essential to encourage payment. ", doi="10.2196/16765", url="http://medinform.jmir.org/2020/2/e16765/", url="http://www.ncbi.nlm.nih.gov/pubmed/32069213" } @Article{info:doi/10.2196/16377, author="Lee, J. Edmund W. and Viswanath, Kasisomayajula", title="Big Data in Context: Addressing the Twin Perils of Data Absenteeism and Chauvinism in the Context of Health Disparities Research", journal="J Med Internet Res", year="2020", month="Jan", day="7", volume="22", number="1", pages="e16377", keywords="big data", keywords="artificial intelligence", keywords="health informatics", keywords="wearable electronic devices", keywords="mobile health", keywords="social media", keywords="electronic health records", keywords="digital divide", keywords="health disparities", doi="10.2196/16377", url="https://www.jmir.org/2020/1/e16377", url="http://www.ncbi.nlm.nih.gov/pubmed/31909724" } @Article{info:doi/10.2196/13917, author="Lelong, Romain and Soualmia, F. Lina and Grosjean, Julien and Taalba, Mehdi and Darmoni, J. St{\'e}fan", title="Building a Semantic Health Data Warehouse in the Context of Clinical Trials: Development and Usability Study", journal="JMIR Med Inform", year="2019", month="Dec", day="20", volume="7", number="4", pages="e13917", keywords="data warehousing", keywords="search engine", keywords="semantics", keywords="clinical trial", keywords="patient selection", abstract="Background: The huge amount of clinical, administrative, and demographic data recorded and maintained by hospitals can be consistently aggregated into health data warehouses with a uniform data model. In 2017, Rouen University Hospital (RUH) initiated the design of a semantic health data warehouse enabling both semantic description and retrieval of health information. Objective: This study aimed to present a proof of concept of this semantic health data warehouse, based on the data of 250,000 patients from RUH, and to assess its ability to assist health professionals in prescreening eligible patients in a clinical trials context. Methods: The semantic health data warehouse relies on 3 distinct semantic layers: (1) a terminology and ontology portal, (2) a semantic annotator, and (3) a semantic search engine and NoSQL (not only structured query language) layer to enhance data access performance. The system adopts an entity-centered vision that provides generic search capabilities able to express data requirements in terms of the whole set of interconnected conceptual entities that compose health information. Results: We assessed the ability of the system to assist the search for 95 inclusion and exclusion criteria originating from 5 randomly chosen clinical trials from RUH.
The system succeeded in fully automating 39\% (29/74) of the criteria and was efficiently used as a prescreening tool for 73\% (54/74) of them. For each criterion, we also examined the targeted sources of information and the search engine--related or data-related limitations that could explain the results. Conclusions: The entity-centered vision contrasts with the usual patient-centered vision adopted by existing systems. It enables more genericity in the information retrieval process. It also allows full exploitation of the semantic description of health information. Despite their semantic annotation, searching within clinical narratives remained the major challenge of the system. A finer annotation of the clinical texts and the addition of specific functionalities would significantly improve the results. The semantic aspect of the system combined with its generic entity-centered vision enables the processing of a large range of clinical questions. However, an important part of health information remains in clinical narratives, and we are currently investigating novel approaches (deep learning) to enhance the semantic annotation of those unstructured data. ", doi="10.2196/13917", url="http://medinform.jmir.org/2019/4/e13917/", url="http://www.ncbi.nlm.nih.gov/pubmed/31859675" } @Article{info:doi/10.2196/16607, author="Lovis, Christian", title="Unlocking the Power of Artificial Intelligence and Big Data in Medicine", journal="J Med Internet Res", year="2019", month="Nov", day="8", volume="21", number="11", pages="e16607", keywords="medical informatics", keywords="artificial intelligence", keywords="big data", doi="10.2196/16607", url="https://www.jmir.org/2019/11/e16607", url="http://www.ncbi.nlm.nih.gov/pubmed/31702565" } @Article{info:doi/10.2196/14083, author="Kim, Mina and Shin, Soo-Yong and Kang, Mira and Yi, Byoung-Kee and Chang, Kyung Dong", title="Developing a Standardization Algorithm for Categorical Laboratory Tests for Clinical Big Data Research: Retrospective Study", journal="JMIR Med Inform", year="2019", month="Aug", day="29", volume="7", number="3", pages="e14083", keywords="standardization", keywords="electronic health records", keywords="data quality", keywords="data science", abstract="Background: Data standardization is essential in electronic health records (EHRs) for both clinical practice and retrospective research. However, it is still not easy to standardize EHR data because of nonidentical duplicates, typographical errors, or inconsistencies. To overcome this drawback, standardization efforts have been undertaken for collecting data in a standardized format as well as for curating the stored data in EHRs. To perform clinical big data research, the stored data in EHRs should be standardized, starting from laboratory results, given their importance. However, most of the previous efforts have been based on labor-intensive manual methods. Objective: We aimed to develop an automatic standardization method for eliminating noise from categorical laboratory data and for grouping and mapping the cleaned data using standard terminology. Methods: We developed a method called standardization algorithm for laboratory test--categorical result (SALT-C) that can process categorical laboratory data, such as pos +, 250 4+ (urinalysis results), and reddish (urinalysis color results). SALT-C consists of five steps. First, it applies data cleaning rules to categorical laboratory data.
Second, it categorizes the cleaned data into 5 predefined groups (urine color, urine dipstick, blood type, presence-finding, and pathogenesis tests). Third, all data in each group are vectorized. Fourth, similarity is calculated between the vectors of data and those of each value in the predefined value sets. Finally, the value closest to the data is assigned. Results: The performance of SALT-C was validated using 59,213,696 data points (167,938 unique values) generated over 23 years from a tertiary hospital. Apart from the data whose original meaning could not be interpreted correctly (eg, ** and \_^), SALT-C mapped unique raw data to the correct reference value for each group with accuracies of 97.6\% (123/126; urine color tests), 97.5\% (198/203; urine dipstick tests), 95\% (53/56; blood type tests), 99.68\% (162,291/162,805; presence-finding tests), and 99.61\% (4643/4661; pathogenesis tests). Conclusions: The proposed SALT-C successfully standardized the categorical laboratory test results with high reliability. SALT-C can be beneficial for clinical big data research by reducing laborious manual standardization efforts. ", doi="10.2196/14083", url="http://medinform.jmir.org/2019/3/e14083/", url="http://www.ncbi.nlm.nih.gov/pubmed/31469075" } @Article{info:doi/10.2196/14126, author="Kim, Heon Ho and Kim, Bora and Joo, Segyeong and Shin, Soo-Yong and Cha, Soung Hyo and Park, Rang Yu", title="Why Do Data Users Say Health Care Data Are Difficult to Use? A Cross-Sectional Survey Study", journal="J Med Internet Res", year="2019", month="Aug", day="06", volume="21", number="8", pages="e14126", keywords="data anonymization", keywords="privacy act", keywords="data sharing", keywords="data protection", keywords="data linking", keywords="health care data demand", abstract="Background: There has been significant effort in attempting to use health care data. However, laws that protect patients' privacy have restricted data use because health care data contain sensitive information. Thus, discussions on privacy laws now focus on the active use of health care data beyond protection. However, current literature does not clarify the obstacles that make data usage and deidentification processes difficult or elaborate on users' needs for data linking from practical perspectives. Objective: The objective of this study is to investigate (1) the current status of data use in each medical area, (2) institutional efforts and difficulties in deidentification processes, and (3) users' data linking needs. Methods: We conducted a cross-sectional online survey. To recruit people who have used health care data, we publicized the promotion campaign and sent official documents to an academic society encouraging participation in the online survey. Results: In total, 128 participants responded to the online survey; 10 participants were excluded for either inconsistent responses or lack of demand for health care data. Finally, 118 participants' responses were analyzed. The majority of participants worked in general hospitals or universities (62/118, 52.5\% and 51/118, 43.2\%, respectively, multiple-choice answers). More than half of participants responded that they have a need for clinical data (82/118, 69.5\%) and public data (76/118, 64.4\%). Furthermore, 85.6\% (101/118) of respondents conducted deidentification measures when using data, and they considered rigid social culture as an obstacle for deidentification (28/101, 27.7\%).
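Returning to the SALT-C algorithm above (doi 10.2196/14083), its steps 3-5 (vectorize, score similarity, assign the closest reference value) could look roughly like this; the character n-gram vectorizer and the tiny value set are illustrative assumptions, not the published implementation:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Hypothetical reference value set for presence-finding tests; the real
# SALT-C groups, cleaning rules, and value sets are defined in the paper.
reference = ["positive", "negative", "trace"]
raw_values = ["pos +", "negat1ve", "postive", "tr ace"]

# Vectorize with character n-grams, score similarity against each
# reference value, and assign the closest one.
vec = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 3))
ref_matrix = vec.fit_transform(reference)
sim = cosine_similarity(vec.transform(raw_values), ref_matrix)

for value, scores in zip(raw_values, sim):
    print(f"{value!r} -> {reference[scores.argmax()]}")
```

In practice the quality of such a mapping depends heavily on the cleaning rules applied in the earlier steps.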
In addition, they required data linking (98/118, 83.1\%), and they cited deregulation (33/98, 33.7\%) and data standardization (38/98, 38.8\%) as prerequisites for health care data linking. There were no significant differences in reported data needs and linking needs between groups that used health care data for public purposes and those that used it for commercial purposes. Conclusions: This study provides a cross-sectional view from a practical, user-oriented perspective on the kinds of data users want to utilize, efforts and difficulties in deidentification processes, and the needs for data linking. Most users want to use clinical and public data, and most participants conduct deidentification processes and express a desire to conduct data linking. Our study confirmed that they noted regulation as a primary obstacle whether their purpose was commercial or public. A legal system based on both data utilization and data protection needs is required. ", doi="10.2196/14126", url="https://www.jmir.org/2019/8/e14126/", url="http://www.ncbi.nlm.nih.gov/pubmed/31389335" } @Article{info:doi/10.2196/11966, author="Tobore, Igbe and Li, Jingzhen and Yuhang, Liu and Al-Handarish, Yousef and Kandwal, Abhishek and Nie, Zedong and Wang, Lei", title="Deep Learning Intervention for Health Care Challenges: Some Biomedical Domain Considerations", journal="JMIR Mhealth Uhealth", year="2019", month="Aug", day="02", volume="7", number="8", pages="e11966", keywords="machine learning", keywords="deep learning", keywords="big data", keywords="mHealth", keywords="medical imaging", keywords="electronic health record", keywords="biologicals", keywords="biomedical", keywords="ECG", keywords="EEG", keywords="artificial intelligence", doi="10.2196/11966", url="https://mhealth.jmir.org/2019/8/e11966/", url="http://www.ncbi.nlm.nih.gov/pubmed/31376272" } @Article{info:doi/10.2196/13209, author="Doryab, Afsaneh and Villalba, K. Daniella and Chikersal, Prerna and Dutcher, M. Janine and Tumminia, Michael and Liu, Xinwen and Cohen, Sheldon and Creswell, Kasey and Mankoff, Jennifer and Creswell, D. John and Dey, K. Anind", title="Identifying Behavioral Phenotypes of Loneliness and Social Isolation with Passive Sensing: Statistical Analysis, Data Mining and Machine Learning of Smartphone and Fitbit Data", journal="JMIR Mhealth Uhealth", year="2019", month="Jul", day="24", volume="7", number="7", pages="e13209", keywords="mobile health", keywords="loneliness", keywords="machine learning", keywords="statistical data analysis", keywords="data mining", keywords="digital phenotyping", abstract="Background: Feelings of loneliness are associated with poor physical and mental health. Detection of loneliness through passive sensing on personal devices can lead to the development of interventions aimed at decreasing rates of loneliness. Objective: The aim of this study was to explore the potential of using passive sensing to infer levels of loneliness and to identify the corresponding behavioral patterns. Methods: Data were collected from smartphones and Fitbits (Flex 2) of 160 college students over a semester. The participants completed the University of California, Los Angeles (UCLA) loneliness questionnaire at the beginning and end of the semester. For classification purposes, the scores were categorized into high (questionnaire score >40) and low (≤40) levels of loneliness. Daily features were extracted from both devices to capture activity and mobility, communication and phone usage, and sleep behaviors.
The features were then averaged to generate semester-level features. We used 3 analytic methods: (1) statistical analysis to provide an overview of loneliness in college students, (2) data mining using the Apriori algorithm to extract behavior patterns associated with loneliness, and (3) machine learning classification to infer the level of loneliness and the change in levels of loneliness using an ensemble of gradient boosting and logistic regression algorithms with feature selection in a leave-one-student-out cross-validation manner. Results: The average loneliness score from the presurveys and postsurveys was above 43 (presurvey SD 9.4 and postsurvey SD 10.4), and the majority of participants fell into the high loneliness category (scores above 40) with 63.8\% (102/160) in the presurvey and 58.8\% (94/160) in the postsurvey. Scores greater than 1 standard deviation above the mean were observed in 12.5\% (20/160) of the participants in both pre- and postsurvey scores. The majority of scores, however, fell between 1 standard deviation below and above the mean (pre=66.9\% [107/160] and post=73.1\% [117/160]). Our machine learning pipeline achieved an accuracy of 80.2\% in detecting the binary level of loneliness and an 88.4\% accuracy in detecting change in the loneliness level. The mining of associations between classifier-selected behavioral features and loneliness indicated that compared with students with low loneliness, students with high levels of loneliness were spending less time outside of campus during evening hours on weekends and spending less time in places for social events in the evening on weekdays (support=17\% and confidence=92\%). The analysis also indicated that more activity and less sedentary behavior, especially in the evening, were associated with a decrease in levels of loneliness from the beginning of the semester to the end of it (support=31\% and confidence=92\%). Conclusions: Passive sensing has the potential for detecting loneliness in college students and identifying the associated behavioral patterns. These findings highlight intervention opportunities through mobile technology to reduce the impact of loneliness on individuals' health and well-being. ", doi="10.2196/13209", url="http://mhealth.jmir.org/2019/7/e13209/", url="http://www.ncbi.nlm.nih.gov/pubmed/31342903" } @Article{info:doi/10.2196/12876, author="Foufi, Vasiliki and Timakum, Tatsawan and Gaudet-Blavignac, Christophe and Lovis, Christian and Song, Min", title="Mining of Textual Health Information from Reddit: Analysis of Chronic Diseases With Extracted Entities and Their Relations", journal="J Med Internet Res", year="2019", month="Jun", day="13", volume="21", number="6", pages="e12876", keywords="social media", keywords="chronic disease", keywords="data mining", abstract="Background: Social media platforms constitute a rich data source for natural language processing tasks such as named entity recognition, relation extraction, and sentiment analysis. In particular, social media platforms about health provide a different insight into patients' experiences with diseases and treatment than those found in the scientific literature. Objective: This paper aimed to report a study of entities related to chronic diseases and their relations in user-generated text posts. The major focus of our research is the study of biomedical entities found in health social media platforms and their relations and the way people suffering from chronic diseases express themselves.
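For the passive-sensing loneliness study above (doi 10.2196/13209), the leave-one-student-out evaluation of a gradient boosting plus logistic regression ensemble can be sketched as follows; the random feature matrix stands in for the real semester-level features, and the paper's feature selection step is omitted:

```python
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

rng = np.random.default_rng(7)
X = rng.normal(size=(160, 20))     # one row of semester-level features per student
y = rng.integers(0, 2, size=160)   # high (1) vs low (0) loneliness
groups = np.arange(160)            # each student is their own group

ensemble = VotingClassifier(
    estimators=[("gb", GradientBoostingClassifier(random_state=7)),
                ("lr", LogisticRegression(max_iter=1000))],
    voting="soft",
)
acc = cross_val_score(ensemble, X, y, cv=LeaveOneGroupOut(), groups=groups).mean()
print(f"leave-one-student-out accuracy: {acc:.3f}")  # ~0.5 on random labels
```

Holding out one student per fold avoids leaking a participant's own data into the model that predicts them.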
Methods: We collected a corpus of 17,624 text posts from disease-specific subreddits of the social news and discussion website Reddit. For entity and relation extraction from this corpus, we employed the PKDE4J tool developed by Song et al (2015). PKDE4J is a text mining system that integrates dictionary-based entity extraction and rule-based relation extraction in a highly flexible and extensible framework. Results: Using PKDE4J, we extracted 2 types of entities and relations: biomedical entities and relations and subject-predicate-object entity relations. In total, 82,138 entities and 30,341 relation pairs were extracted from the Reddit dataset. The most highly mentioned entities were those related to oncological disease (2884 occurrences of cancer) and asthma (2180 occurrences). The relation pair anatomy-disease was the most frequent (5550 occurrences), the most frequent entities in this pair being cancer and lymph. The manual validation of the extracted entities showed a very good performance of the system at the entity extraction task (3682/5151, 71.48\% of extracted entities were correctly labeled). Conclusions: This study showed that people are eager to share their personal experience with chronic diseases on social media platforms despite possible privacy and security issues. The results reported in this paper are promising and demonstrate the need for more in-depth studies on the way patients with chronic diseases express themselves on social media platforms. ", doi="10.2196/12876", url="http://www.jmir.org/2019/6/e12876/", url="http://www.ncbi.nlm.nih.gov/pubmed/31199327" } @Article{info:doi/10.2196/13583, author="Zheng, Xiaochen and Sun, Shengjing and Mukkamala, Rao Raghava and Vatrapu, Ravi and Ordieres-Mer{\'e}, Joaqu{\'i}n", title="Accelerating Health Data Sharing: A Solution Based on the Internet of Things and Distributed Ledger Technologies", journal="J Med Internet Res", year="2019", month="Jun", day="06", volume="21", number="6", pages="e13583", keywords="Internet of Things", keywords="distributed ledger technologies", keywords="data sharing", keywords="health information interoperability", keywords="IOTA Tangle", keywords="masked authenticated messaging", keywords="blockchain", keywords="intelligent healthcare", abstract="Background: Huge amounts of health-related data are generated every moment with the rapid development of Internet of Things (IoT) and wearable technologies. These big health data contain great value and can bring benefit to all stakeholders in the health care ecosystem. Currently, most of these data are siloed and fragmented in different health care systems or public and private databases. This fragmentation prevents the fulfillment of intelligent health care inspired by these big data. Security and privacy concerns and the lack of ensured authenticity trails of data bring even more obstacles to health data sharing. With a decentralized and consensus-driven nature, distributed ledger technologies (DLTs) provide reliable solutions such as blockchain, Ethereum, and IOTA Tangle to facilitate health care data sharing. Objective: This study aimed to develop a health-related data sharing system by integrating IoT and DLT to enable secure, fee-less, tamper-resistant, highly-scalable, and granularly-controllable health data exchange, as well as build a prototype and conduct experiments to verify the feasibility of the proposed solution. Methods: The health-related data are generated by 2 types of IoT devices: wearable devices and stationary air quality sensors.
The data sharing mechanism is enabled by IOTA's distributed ledger, the Tangle, which is a directed acyclic graph. Masked Authenticated Messaging (MAM) is adopted to facilitate data communications among different parties. Merkle Hash Tree is used for data encryption and verification. Results: A prototype system was built according to the proposed solution. It uses a smartwatch and multiple air sensors as the sensing layer; a smartphone and a single-board computer (Raspberry Pi) as the gateway; and a local server for data publishing. The prototype was applied to the remote diagnosis of tremor disease. The results proved that the solution could enable costless data integrity and flexible access management during data sharing. Conclusions: DLT integrated with IoT technologies could greatly improve the health-related data sharing. The proposed solution based on IOTA Tangle and MAM could overcome many challenges faced by other traditional blockchain-based solutions in terms of cost, efficiency, scalability, and flexibility in data access management. This study also showed the possibility of fully decentralized health data sharing by replacing the local server with edge computing devices. ", doi="10.2196/13583", url="https://www.jmir.org/2019/6/e13583/", url="http://www.ncbi.nlm.nih.gov/pubmed/31172963" } @Article{info:doi/10.2196/11456, author="Mohan, Diwakar and Bashingwa, Harrisson Jean Juste and Dane, Pierre and Chamberlain, Sara and Tiffin, Nicki and Lefevre, Amnesty", title="Use of Big Data and Machine Learning Methods in the Monitoring and Evaluation of Digital Health Programs in India: An Exploratory Protocol", journal="JMIR Res Protoc", year="2019", month="May", day="24", volume="8", number="5", pages="e11456", keywords="machine learning", keywords="mobile health", keywords="IVR messaging", abstract="Background: Digital health programs, which encompass the subsectors of health information technology, mobile health, electronic health, telehealth, and telemedicine, have the potential to generate ``big data.'' Objective: Our aim is to evaluate two digital health programs in India---the maternal mobile messaging service (Kilkari) and the mobile training resource for frontline health workers (Mobile Academy). We illustrate possible applications of machine learning for public health practitioners that can be applied to generate evidence on program effectiveness and improve implementation. Kilkari is an outbound service that delivers weekly gestational age--appropriate audio messages about pregnancy, childbirth, and childcare directly to families on their mobile phones, starting from the second trimester of pregnancy until the child is one year old. Mobile Academy is an Interactive Voice Response audio training course for accredited social health activists (ASHAs) in India. Methods: Study participants include pregnant and postpartum women (Kilkari) as well as frontline health workers (Mobile Academy) across 13 states in India. Data elements are drawn from system-generated databases used in the routine implementation of programs to provide users with health information. We explain the structure and elements of the extracted data and the proposed process for their linkage. We then outline the various steps to be undertaken to evaluate and select final algorithms for identifying gaps in data quality, poor user performance, predictors for call receipt, user listening levels, and linkages between early listening and continued engagement. 
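A minimal sketch of the Merkle-tree verification step used in the IoT data sharing prototype above (doi 10.2196/13583); strictly speaking, a Merkle tree provides tamper detection rather than encryption, and this toy construction is one reasonable implementation assumed here, not the authors' code:

```python
import hashlib

def sha256(data: bytes) -> bytes:
    return hashlib.sha256(data).digest()

def merkle_root(leaves):
    """Fold a list of raw leaf payloads up to a single Merkle root."""
    level = [sha256(leaf) for leaf in leaves]
    while len(level) > 1:
        if len(level) % 2:                 # duplicate the last node on odd levels
            level.append(level[-1])
        level = [sha256(level[i] + level[i + 1]) for i in range(0, len(level), 2)]
    return level[0]

readings = [b'{"hr": 72}', b'{"pm25": 12}', b'{"hr": 75}']
root = merkle_root(readings)
assert merkle_root(readings) == root                         # unchanged data verifies
assert merkle_root([b'{"hr": 99}'] + readings[1:]) != root   # tampering is detected
print(root.hex())
```

Publishing only the root on the ledger lets any recipient later verify a batch of sensor readings without trusting the sender.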
Results: The project has obtained the necessary approvals for the use of data in accordance with global standards for handling personal data. The results are expected to be published in August/September 2019. Conclusions: Rigorous evaluations of digital health programs are limited, and few have included applications of machine learning. By describing the steps to be undertaken in the application of machine learning approaches to the analysis of routine system-generated data, we aim to demystify the use of machine learning not only in evaluating digital health education programs but in improving their performance. Where articles on analysis offer an explanation of the final model selected, here we aim to emphasize the process, thereby illustrating to program implementors and evaluators with limited exposure to machine learning its relevance and potential use within the context of broader program implementation and evaluation. International Registered Report Identifier (IRRID): DERR1-10.2196/11456 ", doi="10.2196/11456", url="https://www.researchprotocols.org/2019/5/e11456/", url="http://www.ncbi.nlm.nih.gov/pubmed/31127716" } @Article{info:doi/10.2196/12702, author="Dankar, K. Fida and Madathil, Nisha and Dankar, K. Samar and Boughorbel, Sabri", title="Privacy-Preserving Analysis of Distributed Biomedical Data: Designing Efficient and Secure Multiparty Computations Using Distributed Statistical Learning Theory", journal="JMIR Med Inform", year="2019", month="Apr", day="29", volume="7", number="2", pages="e12702", keywords="data analytics", keywords="data aggregation", keywords="personal genetic information", keywords="patient data privacy", abstract="Background: Biomedical research often requires large cohorts and necessitates the sharing of biomedical data with researchers around the world, which raises many privacy, ethical, and legal concerns. In the face of these concerns, privacy experts are trying to explore approaches to analyzing the distributed data while protecting its privacy. Many of these approaches are based on secure multiparty computations (SMCs). SMC is an attractive approach allowing multiple parties to collectively carry out calculations on their datasets without having to reveal their own raw data; however, it incurs heavy computation time and requires extensive communication between the involved parties. Objective: This study aimed to develop usable and efficient SMC applications that meet the needs of the potential end-users and to raise general awareness about SMC as a tool that supports data sharing. Methods: We have introduced distributed statistical computing (DSC) into the design of secure multiparty protocols, which allows us to conduct computations on each of the parties' sites independently and then combine these computations to form 1 estimator for the collective dataset, thus limiting communication to the final step and reducing complexity. The effectiveness of our privacy-preserving model is demonstrated through a linear regression application. Results: Our secure linear regression algorithm was tested for accuracy and performance using real and synthetic datasets. The results showed no loss of accuracy (over nonsecure regression) and very good performance (20 min for 100 million records). Conclusions: We used DSC to securely calculate a linear regression model over multiple datasets. Our experiments showed very good performance (in terms of the number of records it can handle). We plan to extend our method to other estimators such as logistic regression. 
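The distributed statistical computing idea just described (each site shares only summary statistics; a coordinator combines them into one estimator) can be sketched for linear regression as follows; the secure multiparty computation layer that protects the summaries in the actual protocol is omitted here:

```python
import numpy as np

def site_summary(X, y):
    """Each party computes and shares only X'X and X'y, never row-level data."""
    X1 = np.column_stack([np.ones(len(X)), X])
    return X1.T @ X1, X1.T @ y

rng = np.random.default_rng(1)
beta_true = np.array([1.0, 0.5, -2.0])

summaries = []
for _ in range(3):                     # three sites; their rows are never pooled
    X = rng.normal(size=(500, 2))
    y = beta_true[0] + X @ beta_true[1:] + rng.normal(size=500)
    summaries.append(site_summary(X, y))

# The coordinator combines the summaries into one global estimator,
# identical to OLS on the pooled data (hence no loss of accuracy).
XtX = sum(s[0] for s in summaries)
Xty = sum(s[1] for s in summaries)
print(np.round(np.linalg.solve(XtX, Xty), 3))   # ~ [1.0, 0.5, -2.0]
```

Because the sufficient statistics add exactly, communication is limited to one round, which is the efficiency gain the study reports.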
", doi="10.2196/12702", url="http://medinform.jmir.org/2019/2/e12702/", url="http://www.ncbi.nlm.nih.gov/pubmed/31033449" } @Article{info:doi/10.2196/13043, author="McPadden, Jacob and Durant, JS Thomas and Bunch, R. Dustin and Coppi, Andreas and Price, Nathaniel and Rodgerson, Kris and Torre Jr, J. Charles and Byron, William and Hsiao, L. Allen and Krumholz, M. Harlan and Schulz, L. Wade", title="Health Care and Precision Medicine Research: Analysis of a Scalable Data Science Platform", journal="J Med Internet Res", year="2019", month="Apr", day="09", volume="21", number="4", pages="e13043", keywords="data science", keywords="monitoring, physiologic", keywords="computational health care", keywords="medical informatics computing", keywords="big data", abstract="Background: Health care data are increasing in volume and complexity. Storing and analyzing these data to implement precision medicine initiatives and data-driven research has exceeded the capabilities of traditional computer systems. Modern big data platforms must be adapted to the specific demands of health care and designed for scalability and growth. Objective: The objectives of our study were to (1) demonstrate the implementation of a data science platform built on open source technology within a large, academic health care system and (2) describe 2 computational health care applications built on such a platform. Methods: We deployed a data science platform based on several open source technologies to support real-time, big data workloads. We developed data-acquisition workflows for Apache Storm and NiFi in Java and Python to capture patient monitoring and laboratory data for downstream analytics. Results: Emerging data management approaches, along with open source technologies such as Hadoop, can be used to create integrated data lakes to store large, real-time datasets. This infrastructure also provides a robust analytics platform where health care and biomedical research data can be analyzed in near real time for precision medicine and computational health care use cases. Conclusions: The implementation and use of integrated data science platforms offer organizations the opportunity to combine traditional datasets, including data from the electronic health record, with emerging big data sources, such as continuous patient monitoring and real-time laboratory results. These platforms can enable cost-effective and scalable analytics for the information that will be key to the delivery of precision medicine initiatives. Organizations that can take advantage of the technical advances found in data science platforms will have the opportunity to provide comprehensive access to health care data for computational health care and precision medicine research. 
", doi="10.2196/13043", url="https://www.jmir.org/2019/4/e13043/", url="http://www.ncbi.nlm.nih.gov/pubmed/30964441" } @Article{info:doi/10.2196/12577, author="Tang, Chunlei and Sun, Huajun and Xiong, Yun and Yang, Jiahong and Vitale, Christopher and Ruan, Lu and Ai, Angela and Yu, Guangjun and Ma, Jing and Bates, David", title="Medication Use for Childhood Pneumonia at a Children's Hospital in Shanghai, China: Analysis of Pattern Mining Algorithms", journal="JMIR Med Inform", year="2019", month="Mar", day="22", volume="7", number="1", pages="e12577", keywords="drug therapy", keywords="combination", keywords="computer-assisted", keywords="pattern recognition", keywords="data mining", keywords="precision medicine", keywords="childhood pneumonia", keywords="hospital", abstract="Background: Pattern mining utilizes multiple algorithms to explore objective and sometimes unexpected patterns in real-world data. This technique could be applied to electronic medical record data mining; however, it first requires a careful clinical assessment and validation. Objective: The aim of this study was to examine the use of pattern mining techniques on a large clinical dataset to detect treatment and medication use patterns for childhood pneumonia. Methods: We applied 3 pattern mining algorithms to 680,138 medication administration records from 30,512 childhood inpatients with diagnosis of pneumonia during a 6-year period at a children's hospital in China. Patients' ages ranged from 0 to 17 years, where 37.53\% (11,453/30,512) were 0 to 3 months old, 86.55\% (26,408/30,512) were under 5 years, 60.37\% (18,419/30,512) were male, and 60.10\% (18,338/30,512) had a hospital stay of 9 to 15 days. We used the FP-Growth, PrefixSpan, and USpan pattern mining algorithms. The first 2 are more traditional methods of pattern mining and mine a complete set of frequent medication use patterns. PrefixSpan also incorporates an administration sequence. The newer USpan method considers medication utility, defined by the dose, frequency, and timing of use of the 652 individual medications in the dataset. Together, these 3 methods identified the top 10 patterns from 6 age groups, forming a total of 180 distinct medication combinations. These medications encompassed the top 40 (73.66\%, 500,982/680,138) most frequently used medications. These patterns were then evaluated by subject matter experts to summarize 5 medication use and 2 treatment patterns. Results: We identified 5 medication use patterns: (1) antiasthmatics and expectorants and corticosteroids, (2) antibiotics and (antiasthmatics or expectorants or corticosteroids), (3) third-generation cephalosporin antibiotics with (or followed by) traditional antibiotics, (4) antibiotics and (medications for enteritis or skin diseases), and (5) (antiasthmatics or expectorants or corticosteroids) and (medications for enteritis or skin diseases). We also identified 2 frequent treatment patterns: (1) 42.89\% (291,701/680,138) of specific medication administration records were of intravenous therapy with antibiotics, diluents, and nutritional supplements and (2) 11.53\% (78,390/680,138) were of various combinations of inhalation of antiasthmatics, expectorants, or corticosteroids. Fleiss kappa for the subject experts' evaluation was 0.693, indicating moderate agreement. Conclusions: Utilizing a pattern mining approach, we summarized 5 medication use patterns and 2 treatment patterns. These warrant further investigation. 
", doi="10.2196/12577", url="http://medinform.jmir.org/2019/1/e12577/", url="http://www.ncbi.nlm.nih.gov/pubmed/30900998" } @Article{info:doi/10.2196/11732, author="Bezemer, Tim and de Groot, CH Mark and Blasse, Enja and ten Berg, J. Maarten and Kappen, H. Teus and Bredenoord, L. Annelien and van Solinge, W. Wouter and Hoefer, E. Imo and Haitjema, Saskia", title="A Human(e) Factor in Clinical Decision Support Systems", journal="J Med Internet Res", year="2019", month="Mar", day="19", volume="21", number="3", pages="e11732", keywords="clinical decision support", keywords="big data", keywords="artificial intelligence", keywords="machine learning", keywords="deep learning", keywords="precision medicine", keywords="expert systems", keywords="data science", keywords="health care providers", doi="10.2196/11732", url="http://www.jmir.org/2019/3/e11732/", url="http://www.ncbi.nlm.nih.gov/pubmed/30888324" } @Article{info:doi/10.2196/10013, author="Woo, Hyunki and Kim, Kyunga and Cha, KyeongMin and Lee, Jin-Young and Mun, Hansong and Cho, Jin Soo and Chung, In Ji and Pyo, Hui Jeung and Lee, Kun-Chul and Kang, Mira", title="Application of Efficient Data Cleaning Using Text Clustering for Semistructured Medical Reports to Large-Scale Stool Examination Reports: Methodology Study", journal="J Med Internet Res", year="2019", month="Jan", day="08", volume="21", number="1", pages="e10013", keywords="data cleaning", keywords="text clustering", keywords="key collision", keywords="nearest neighbor methods", keywords="OpenRefine", abstract="Background: Since medical research based on big data has become more common, the community's interest and effort to analyze a large amount of semistructured or unstructured text data, such as examination reports, have rapidly increased. However, these large-scale text data are often not readily applicable to analysis owing to typographical errors, inconsistencies, or data entry problems. Therefore, an efficient data cleaning process is required to ensure the veracity of such data. Objective: In this paper, we proposed an efficient data cleaning process for large-scale medical text data, which employs text clustering methods and value-converting technique, and evaluated its performance with medical examination text data. Methods: The proposed data cleaning process consists of text clustering and value-merging. In the text clustering step, we suggested the use of key collision and nearest neighbor methods in a complementary manner. Words (called values) in the same cluster would be expected as a correct value and its wrong representations. In the value-converting step, wrong values for each identified cluster would be converted into their correct value. We applied these data cleaning process to 574,266 stool examination reports produced for parasite analysis at Samsung Medical Center from 1995 to 2015. The performance of the proposed process was examined and compared with data cleaning processes based on a single clustering method. We used OpenRefine 2.7, an open source application that provides various text clustering methods and an efficient user interface for value-converting with common-value suggestion. Results: A total of 1,167,104 words in stool examination reports were surveyed. In the data cleaning process, we discovered 30 correct words and 45 patterns of typographical errors and duplicates. We observed high correction rates for words with typographical errors (98.61\%) and typographical error patterns (97.78\%). 
The resulting data accuracy was nearly 100\% based on the total number of words. Conclusions: Our data cleaning process based on the combinatorial use of key collision and nearest neighbor methods provides an efficient cleaning of large-scale text data and hence improves data accuracy. ", doi="10.2196/10013", url="https://www.jmir.org/2019/1/e10013/", url="http://www.ncbi.nlm.nih.gov/pubmed/30622098" } @Article{info:doi/10.2196/resprot.9589, author="Kotz, David and Lord, E. Sarah and O'Malley, James A. and Stark, Luke and Marsch, A. Lisa", title="Workshop on Emerging Technology and Data Analytics for Behavioral Health", journal="JMIR Res Protoc", year="2018", month="Jun", day="20", volume="7", number="6", pages="e158", keywords="behavioral health", keywords="mobile technology", keywords="wearable devices", keywords="data analytics", keywords="mHealth", doi="10.2196/resprot.9589", url="http://www.researchprotocols.org/2018/6/e158/", url="http://www.ncbi.nlm.nih.gov/pubmed/29925493" } @Article{info:doi/10.2196/medinform.9455, author="Huang, Yingxiang and Lee, Junghye and Wang, Shuang and Sun, Jimeng and Liu, Hongfang and Jiang, Xiaoqian", title="Privacy-Preserving Predictive Modeling: Harmonization of Contextual Embeddings From Different Sources", journal="JMIR Med Inform", year="2018", month="May", day="16", volume="6", number="2", pages="e33", keywords="interoperability", keywords="contextual embedding", keywords="predictive models", keywords="patient data privacy", abstract="Background: Data sharing has been a big challenge in biomedical informatics because of privacy concerns. Contextual embedding models have demonstrated a very strong representative capability to describe medical concepts (and their context), and they have shown promise as an alternative way to support deep-learning applications without the need to disclose original data. However, contextual embedding models acquired from individual hospitals cannot be directly combined because their embedding spaces are different, and naive pooling renders combined embeddings useless. Objective: The aim of this study was to present a novel approach to address these issues and to promote sharing representation without sharing data. Without sacrificing privacy, we also aimed to build a global model from representations learned from local private data and synchronize information from multiple sources. Methods: We propose a methodology that harmonizes different local contextual embeddings into a global model. We used Word2Vec to generate contextual embeddings from each source and Procrustes to fuse different vector models into one common space by using a list of corresponding pairs as anchor points. We performed prediction analysis with harmonized embeddings. Results: We used sequential medical events extracted from the Medical Information Mart for Intensive Care III database to evaluate the proposed methodology in predicting the next likely diagnosis of a new patient using either structured data or unstructured data. Under different experimental scenarios, we confirmed that the global model built from harmonized local models achieves a more accurate prediction than local models and global models built from naive pooling. Conclusions: Such aggregation of local models using our unique harmonization can serve as the proxy for a global model, combining information from a wide range of institutions and information sources.
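A minimal sketch of the Procrustes-based harmonization described in the methods above, assuming scipy and toy embeddings in which one hospital's space is an exact rotation of the other's; real Word2Vec spaces would align only approximately:

```python
import numpy as np
from scipy.linalg import orthogonal_procrustes

rng = np.random.default_rng(3)

# Toy setting: hospital B's embedding space is a rotated copy of hospital A's.
emb_a = rng.normal(size=(100, 16))
rotation = np.linalg.qr(rng.normal(size=(16, 16)))[0]
emb_b = emb_a @ rotation

# A list of corresponding pairs (here, the first 30 rows) serves as anchors.
anchors = slice(0, 30)
R, _ = orthogonal_procrustes(emb_b[anchors], emb_a[anchors])

aligned_b = emb_b @ R                    # map B's vectors into A's space
print(np.allclose(aligned_b, emb_a))     # True: the spaces now agree
```

Only the embedding matrices and the anchor list cross institutional boundaries; the underlying clinical text never does.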
It allows information unique to a certain hospital to become available to other sites, increasing the fluidity of information flow in health care. ", doi="10.2196/medinform.9455", url="http://medinform.jmir.org/2018/2/e33/", url="http://www.ncbi.nlm.nih.gov/pubmed/29769172" } @Article{info:doi/10.2196/medinform.8724, author="Ho, Kiki Hoi Ki and G{\"o}rges, Matthias and Portales-Casamar, Elodie", title="Data Access and Usage Practices Across a Cohort of Researchers at a Large Tertiary Pediatric Hospital: Qualitative Survey Study", journal="JMIR Med Inform", year="2018", month="May", day="14", volume="6", number="2", pages="e32", keywords="clinical data sharing, research barriers, data linkage, data sources, data management, environmental scan, research facilitation", abstract="Background: Health and health-related data collected as part of clinical care are a foundational component of quality improvement and research. While the importance of these data is widely recognized, there are many challenges faced by researchers attempting to use such data. It is crucial to acknowledge and identify barriers to improve data sharing and access practices and ultimately optimize research capacity. Objective: To better understand the current state, explore opportunities, and identify barriers, an environmental scan of investigators at BC Children's Hospital Research Institute (BCCHR) was conducted to elucidate current local practices around data access and usage. Methods: The Clinical and Community Data, Analytics and Informatics group at BCCHR comprises over 40 investigators with diverse expertise and interest in data who share a common goal of facilitating data collection, usage, and access across the community. Semistructured interviews with 35 of these researchers were conducted, and data were summarized qualitatively. A total impact score, considering both the frequency with which a problem occurs and the impact of the problem, was calculated for each item to prioritize and rank barriers. Results: Three main themes for barriers emerged: the lengthy turnaround time before data access (18/35, 51\%), inconsistent and opaque data access processes (16/35, 46\%), and the inability to link data effectively (15/35, 43\%). Less frequent themes included quality and usability of data, ethics and privacy review barriers, lack of awareness of data sources, and the duplication of effort in data extraction and linkage. The two main opportunities for improvement were data access facilitation (14/32, 44\%) and migration toward a single data platform (10/32, 31\%). Conclusions: By identifying the current state and needs of the data community onsite, this study enables us to focus our resources on combating the challenges having the greatest impact on researchers. The current state parallels that of the national landscape. By ensuring protection of privacy while achieving efficient data access, research institutions will be able to maximize their research capacity, a crucial step toward the ultimate goal shared by all stakeholders---better health outcomes.
", doi="10.2196/medinform.8724", url="http://medinform.jmir.org/2018/2/e32/", url="http://www.ncbi.nlm.nih.gov/pubmed/29759958" } @Article{info:doi/10.2196/medinform.7992, author="Shachar, Netta and Mitelpunkt, Alexis and Kozlovski, Tal and Galili, Tal and Frostig, Tzviel and Brill, Barak and Marcus-Kalish, Mira and Benjamini, Yoav", title="The Importance of Nonlinear Transformations Use in Medical Data Analysis", journal="JMIR Med Inform", year="2018", month="May", day="11", volume="6", number="2", pages="e27", keywords="data mining", keywords="statistics", keywords="preprocessing", keywords="medical informatics", keywords="health informatics", keywords="big data", keywords="transformations", abstract="Background: The accumulation of data and its accessibility through easier-to-use platforms will allow data scientists and practitioners who are less sophisticated data analysts to get answers by using big data for many purposes in multiple ways. Data scientists working with medical data are aware of the importance of preprocessing, yet in many cases, the potential benefits of using nonlinear transformations is overlooked. Objective: Our aim is to present a semi-automated approach of symmetry-aiming transformations tailored for medical data analysis and its advantages. Methods: We describe 10 commonly encountered data types used in the medical field and the relevant transformations for each data type. Data from the Alzheimer's Disease Neuroimaging Initiative study, Parkinson's disease hospital cohort, and disease-simulating data were used to demonstrate the approach and its benefits. Results: Symmetry-targeted monotone transformations were applied, and the advantages gained in variance, stability, linearity, and clustering are demonstrated. An open source application implementing the described methods was developed. Both linearity of relationships and increase of stability of variability improved after applying proper nonlinear transformation. Clustering simulated nonsymmetric data gave low agreement to the generating clusters (Rand value=0.681), while capturing the original structure after applying nonlinear transformation to symmetry (Rand value=0.986). Conclusions: This work presents the use of nonlinear transformations for medical data and the importance of their semi-automated choice. Using the described approach, the data analyst increases the ability to create simpler, more robust and translational models, thereby facilitating the interpretation and implementation of the analysis by medical practitioners. Applying nonlinear transformations as part of the preprocessing is essential to the quality and interpretability of results. ", doi="10.2196/medinform.7992", url="http://medinform.jmir.org/2018/2/e27/", url="http://www.ncbi.nlm.nih.gov/pubmed/29752251" } @Article{info:doi/10.2196/medinform.9170, author="P Tafti, Ahmad and Badger, Jonathan and LaRose, Eric and Shirzadi, Ehsan and Mahnke, Andrea and Mayer, John and Ye, Zhan and Page, David and Peissig, Peggy", title="Adverse Drug Event Discovery Using Biomedical Literature: A Big Data Neural Network Adventure", journal="JMIR Med Inform", year="2017", month="Dec", day="08", volume="5", number="4", pages="e51", keywords="adverse drug event", keywords="adverse drug reaction", keywords="drug side effects", keywords="machine learning", keywords="text mining", abstract="Background: The study of adverse drug events (ADEs) is a tenured topic in medical literature. 
In recent years, increasing numbers of scientific articles and health-related social media posts have been generated and shared daily, albeit with very limited use for ADE study and with little known about the content with respect to ADEs. Objective: The aim of this study was to develop a big data analytics strategy that mines the content of scientific articles and health-related Web-based social media to detect and identify ADEs. Methods: We analyzed the following two data sources: (1) biomedical articles and (2) health-related social media blog posts. We developed an intelligent and scalable text mining solution on big data infrastructures composed of Apache Spark, natural language processing, and machine learning. This was combined with an Elasticsearch No-SQL distributed database to explore and visualize ADEs. Results: The accuracy, precision, recall, and area under the receiver operating characteristic curve of the system were 92.7\%, 93.6\%, 93.0\%, and 0.905, respectively, and showed better results in comparison with traditional approaches in the literature. This work not only detected and classified ADE sentences from big data biomedical literature but also scientifically visualized ADE interactions. Conclusions: To the best of our knowledge, this work is the first to investigate a big data machine learning strategy for ADE discovery on massive datasets downloaded from PubMed Central and social media. This contribution illustrates possible capacities in big data biomedical text analysis using advanced computational methods with real-time updates from new data published on a daily basis. ", doi="10.2196/medinform.9170", url="http://medinform.jmir.org/2017/4/e51/", url="http://www.ncbi.nlm.nih.gov/pubmed/29222076" } @Article{info:doi/10.2196/resprot.7757, author="Luo, Gang and Stone, L. Bryan and Johnson, D. Michael and Tarczy-Hornoch, Peter and Wilcox, B. Adam and Mooney, D. Sean and Sheng, Xiaoming and Haug, J. Peter and Nkoy, L. Flory", title="Automating Construction of Machine Learning Models With Clinical Big Data: Proposal Rationale and Methods", journal="JMIR Res Protoc", year="2017", month="Aug", day="29", volume="6", number="8", pages="e175", keywords="machine learning", keywords="automated temporal aggregation", keywords="automatic model selection", keywords="care management", keywords="clinical big data", abstract="Background: To improve health outcomes and cut health care costs, we often need to conduct prediction/classification using large clinical datasets (aka, clinical big data), for example, to identify high-risk patients for preventive interventions. Machine learning has been proposed as a key technology for doing this. Machine learning has won most data science competitions and could support many clinical activities, yet only 15\% of hospitals use it for even limited purposes. Despite familiarity with data, health care researchers often lack machine learning expertise to directly use clinical big data, creating a hurdle in realizing value from their data. Health care researchers can work with data scientists with deep machine learning knowledge, but it takes time and effort for both parties to communicate effectively. Facing a shortage of data scientists in the United States and hiring competition from companies with deep pockets, health care systems have difficulty recruiting data scientists.
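Stripped of the Apache Spark and Elasticsearch infrastructure, the core ADE sentence classification step of the study above (doi 10.2196/medinform.9170) might be sketched as follows; the labeled sentences are invented for illustration, and the study's actual neural models are not reproduced:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Tiny invented training set: 1 = sentence describes an adverse drug event.
sentences = [
    "Patient developed severe rash after starting amoxicillin.",
    "Metformin was continued at the usual dose.",
    "Nausea and dizziness were reported following the new regimen.",
    "The follow-up visit was scheduled in six weeks.",
]
labels = [1, 0, 1, 0]

model = make_pipeline(TfidfVectorizer(ngram_range=(1, 2)), LogisticRegression())
model.fit(sentences, labels)

test = ["Severe dizziness occurred after the dose increase."]
print(model.predict_proba(test)[0, 1])   # estimated probability of an ADE sentence
```

At the scale of PubMed Central, the same pipeline shape is simply distributed across a cluster rather than changed in kind.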
Building and generalizing a machine learning model often requires hundreds to thousands of manual iterations by data scientists to select the following: (1) hyper-parameter values and complex algorithms that greatly affect model accuracy and (2) operators and periods for temporally aggregating clinical attributes (eg, whether a patient's weight kept rising in the past year). This process becomes infeasible with limited budgets. Objective: This study's goal is to enable health care researchers to directly use clinical big data, make machine learning feasible with limited budgets and data scientist resources, and realize value from data. Methods: This study will allow us to achieve the following: (1) finish developing the new software, Automated Machine Learning (Auto-ML), to automate model selection for machine learning with clinical big data and validate Auto-ML on seven benchmark modeling problems of clinical importance; (2) apply Auto-ML and novel methodology to two new modeling problems crucial for care management allocation and pilot one model with care managers; and (3) perform simulations to estimate the impact of adopting Auto-ML on US patient outcomes. Results: We are currently writing Auto-ML's design document. We intend to finish our study by around the year 2022. Conclusions: Auto-ML will generalize to various clinical prediction/classification problems. With minimal help from data scientists, health care researchers can use Auto-ML to quickly build high-quality models. This will boost wider use of machine learning in health care and improve patient outcomes. ", doi="10.2196/resprot.7757", url="http://www.researchprotocols.org/2017/8/e175/", url="http://www.ncbi.nlm.nih.gov/pubmed/28851678" } @Article{info:doi/10.2196/medinform.6799, author="Wang, Yaogang and Sun, Li and Hou, Jie", title="Hierarchical Medical System Based on Big Data and Mobile Internet: A New Strategic Choice in Health Care", journal="JMIR Med Inform", year="2017", month="Aug", day="08", volume="5", number="3", pages="e22", keywords="medical services", keywords="continuity of patient care", keywords="mobile health", doi="10.2196/medinform.6799", url="http://medinform.jmir.org/2017/3/e22/", url="http://www.ncbi.nlm.nih.gov/pubmed/28790024" } @Article{info:doi/10.2196/medinform.6690, author="Lee, Joon", title="Patient-Specific Predictive Modeling Using Random Forests: An Observational Study for the Critically Ill", journal="JMIR Med Inform", year="2017", month="Jan", day="17", volume="5", number="1", pages="e3", keywords="forecasting", keywords="critical care", keywords="predictive analytics", keywords="patient similarity", keywords="random forest", abstract="Background: With a large-scale electronic health record repository, it is feasible to build a customized patient outcome prediction model specifically for a given patient. This approach involves identifying past patients who are similar to the present patient and using their data to train a personalized predictive model. Our previous work investigated a cosine-similarity patient similarity metric (PSM) for such patient-specific predictive modeling. Objective: The objective of the study is to investigate the random forest (RF) proximity measure as a PSM in the context of personalized mortality prediction for intensive care unit (ICU) patients. Methods: A total of 17,152 ICU admissions were extracted from the Multiparameter Intelligent Monitoring in Intensive Care II database. A number of predictor variables were extracted from the first 24 hours in the ICU. 
The outcome to be predicted was 30-day mortality. A patient-specific predictive model was trained for each ICU admission using an RF PSM inspired by the RF proximity measure. Death counting, logistic regression, decision tree, and RF models were studied with a hard threshold applied to RF PSM values to include only the M most similar patients in model training, where M was varied. In addition, case-specific random forests (CSRFs), which use RF proximity for weighted bootstrapping, were trained. Results: Compared with our previous study that investigated a cosine similarity PSM, the RF PSM resulted in superior or comparable predictive performance. RF and CSRF exhibited the best performances (in terms of mean area under the receiver operating characteristic curve [95\% confidence interval], RF: 0.839 [0.835-0.844]; CSRF: 0.832 [0.821-0.843]). RF and CSRF did not benefit from personalization via the use of the RF PSM, while the other models did. Conclusions: The RF PSM led to good mortality prediction performance for several predictive models, although it failed to induce improved performance in RF and CSRF. The distinction between predictor and similarity variables is an important issue arising from the present study. RFs present a promising method for patient-specific outcome prediction. ", doi="10.2196/medinform.6690", url="http://medinform.jmir.org/2017/1/e3/", url="http://www.ncbi.nlm.nih.gov/pubmed/28096065" } @Article{info:doi/10.2196/jmir.5870, author="Luo, Wei and Phung, Dinh and Tran, Truyen and Gupta, Sunil and Rana, Santu and Karmakar, Chandan and Shilton, Alistair and Yearwood, John and Dimitrova, Nevenka and Ho, Bao Tu and Venkatesh, Svetha and Berk, Michael", title="Guidelines for Developing and Reporting Machine Learning Predictive Models in Biomedical Research: A Multidisciplinary View", journal="J Med Internet Res", year="2016", month="Dec", day="16", volume="18", number="12", pages="e323", keywords="machine learning", keywords="clinical prediction rule", keywords="guideline", abstract="Background: As more and more researchers are turning to big data for new opportunities in biomedical discovery, machine learning models, as the backbone of big data analysis, are mentioned more often in biomedical journals. However, owing to the inherent complexity of machine learning methods, they are prone to misuse. Because of the flexibility in specifying machine learning models, the results are often insufficiently reported in research articles, hindering reliable assessment of model validity and consistent interpretation of model outputs. Objective: To attain a set of guidelines on the use of machine learning predictive models within clinical settings to make sure the models are correctly applied and sufficiently reported so that true discoveries can be distinguished from random coincidence. Methods: A multidisciplinary panel of machine learning experts, clinicians, and traditional statisticians was interviewed, using an iterative process in accordance with the Delphi method. Results: The process produced a set of guidelines that consists of (1) a list of reporting items to be included in a research article and (2) a set of practical sequential steps for developing predictive models. Conclusions: A set of guidelines was generated to enable correct application of machine learning models and consistent reporting of model specifications and results in biomedical research.
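A sketch of the RF proximity idea behind the patient similarity metric in the Lee study above (doi 10.2196/medinform.6690): two patients are similar to the extent that the trees of a fitted random forest route them to the same leaves. The data are synthetic, and the "death counting" step simply mirrors the baseline named in the abstract; in practice the present patient would be held out of the training set.

# RF proximity as a patient similarity metric: proximity between two patients
# is the fraction of trees in which they fall into the same leaf.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=1000, n_features=15, random_state=1)
forest = RandomForestClassifier(n_estimators=100, random_state=1).fit(X, y)

leaves = forest.apply(X)                 # leaf index per (patient, tree)
new_patient = X[:1]                      # stand-in for the present admission
proximity = (leaves == forest.apply(new_patient)).mean(axis=1)

M = 200                                  # hard threshold on similarity
idx = np.argsort(proximity)[-M:]         # the M most similar past patients
# "Death counting": predicted 30-day mortality risk is the death rate among
# the M most similar patients; the paper also trains logistic regression,
# decision tree, and RF models on this same personalized subset.
print(round(float(y[idx].mean()), 3))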
We believe that such guidelines will accelerate the adoption of big data analysis, particularly with machine learning methods, in the biomedical research community. ", doi="10.2196/jmir.5870", url="http://www.jmir.org/2016/12/e323/", url="http://www.ncbi.nlm.nih.gov/pubmed/27986644" } @Article{info:doi/10.2196/iproc.6129, author="Karhade, V. Aditya and Senders, Joeky and Broekman, L. Marike and Gormley, B. William and Smith, R. Timothy", title="Disruptive Innovation in Neurosurgical Outcomes Research: The Impact of Big Data, Predictive Analytics, and Wearable Technology", journal="iproc", year="2016", month="Dec", day="14", volume="2", number="1", pages="e10", keywords="neurosurgery", keywords="outcomes measures", keywords="artificial intelligence", keywords="big data", keywords="wearable technology", keywords="predictive analytics", abstract="Background: The value agenda in healthcare has created legislative reform, merit-based reimbursement systems, public reporting of surgeon scorecards, and patient-centered neurosurgical outcomes tracking. Though intraoperative technological innovations continue to abound, advances such as artificial intelligence, big data, and wearable technology have yet to become standard tools for outcomes measures in neurosurgery. Objective: The purpose of this work was to review existing tools for outcomes research in neurosurgery and to characterize the disruptive innovation created by artificial intelligence, big data, and wearable technology. Methods: Gold standards for neurosurgical patient-reported outcomes were compared with ongoing work in our center as well as major developments in the fields of mobile health, computer science, and health informatics. Results: The gold standards for neurosurgical outcomes measures (pain scale, Oswestry Disability Index, EuroQol-5D, Short Form Health Survey, etc) provide limited information on time-dependent, longitudinal patient recovery outside of the clinical setting. Our work with smartphone-enabled passively collected data allows for continuous, real-time monitoring of 9 different data streams generating over 1 million data points per day per patient. Artificial intelligence capabilities, including natural language processing and machine learning, quantify and digitize patient quality of life from electronic medical records, audio recordings, and free-text notes. Quantification of patient outcomes is further aided by the creation of wearable physiological sensors specific to neurosurgery, such as a serum sodium-sensing wearable with WiFi communication capabilities to prevent complications and readmissions from delayed symptomatic hyponatremia after transsphenoidal surgery. Conclusions: Systems-level risk adjustment, high-value care, and real-time tracking of functional recovery are enabled by passively collected data. The future of outcomes measures in neurosurgery requires the translation of validated, gold standard assessments into the modern era of big data, artificial intelligence, and wearable technology.
", doi="10.2196/iproc.6129", url="http://www.iproc.org/2016/1/e10/" } @Article{info:doi/10.2196/medinform.5359, author="Kruse, Scott Clemens and Goswamy, Rishi and Raval, Yesha and Marawi, Sarah", title="Challenges and Opportunities of Big Data in Health Care: A Systematic Review", journal="JMIR Med Inform", year="2016", month="Nov", day="21", volume="4", number="4", pages="e38", keywords="big data", keywords="analytics", keywords="health care", keywords="human genome", keywords="electronic medical record", abstract="Background: Big data analytics offers promise in many business sectors, and health care is looking at big data to provide answers to many age-related issues, particularly dementia and chronic disease management. Objective: The purpose of this review was to summarize the challenges faced by big data analytics and the opportunities that big data opens in health care. Methods: A total of 3 searches were performed for publications between January 1, 2010 and January 1, 2016 (PubMed/MEDLINE, CINAHL, and Google Scholar), and an assessment was made on content germane to big data in health care. From the results of the searches in research databases and Google Scholar (N=28), the authors summarized content and identified 9 and 14 themes under the categories Challenges and Opportunities, respectively. We rank-ordered and analyzed the themes based on the frequency of occurrence. Results: The top challenges were issues of data structure, security, data standardization, storage and transfers, and managerial skills such as data governance. The top opportunities revealed were quality improvement, population management and health, early detection of disease, data quality, structure, and accessibility, improved decision making, and cost reduction. Conclusions: Big data analytics has the potential for positive impact and global implications; however, it must overcome some legitimate obstacles. ", doi="10.2196/medinform.5359", url="http://medinform.jmir.org/2016/4/e38/", url="http://www.ncbi.nlm.nih.gov/pubmed/27872036" } @Article{info:doi/10.2196/medinform.6437, author="Luo, Jake and Eldredge, Christina and Cho, C. Chi and Cisler, A. Ron", title="Population Analysis of Adverse Events in Different Age Groups Using Big Clinical Trials Data", journal="JMIR Med Inform", year="2016", month="Oct", day="17", volume="4", number="4", pages="e30", keywords="big data analysis", keywords="adverse events", keywords="clinical trial data", keywords="population health", keywords="clinical trial safety", keywords="data processing and integration", abstract="Background: Understanding adverse event patterns in clinical studies across populations is important for patient safety and protection in clinical trials as well as for developing appropriate drug therapies, procedures, and treatment plans. Objectives: The objective of our study was to conduct a data-driven population-based analysis to estimate the incidence, diversity, and association patterns of adverse events by age of the clinical trials patients and participants. Methods: Two aspects of adverse event patterns were measured: (1) the adverse event incidence rate in each of the patient age groups and (2) the diversity of adverse events defined as distinct types of adverse events categorized by organ system. Statistical analysis was done on the summarized clinical trial data. The incident rate and diversity level in each of the age groups were compared with the lowest group (reference group) using t tests. 
Cohort data were obtained from ClinicalTrials.gov, and 186,339 clinical studies were analyzed; data were extracted from the 17,853 clinical trials that reported clinical outcomes. The total number of clinical trial participants was 6,808,619, and the total number of participants affected by adverse events in these trials was 1,840,432. The trial participants were divided into eight different age groups to support cross-age group comparison. Results: In general, children and older patients are more susceptible to adverse events in clinical trial studies. Using the lowest incidence age group as the reference group (20-29 years), the incidence rate of the 0-9 years-old group was 31.41\%, approximately 1.51 times higher (P=.04) than that of the young adult group (20-29 years) at 20.76\%. The second-highest group was the 50-59 years-old group, with an incidence rate of 30.09\%, significantly higher (P<.001) when compared with the lowest incidence in the 20-29 years-old group. Adverse event diversity also increased with patient age. Clinical studies that recruited older patients (older than 40 years) were more likely to observe a diverse range of adverse events (P<.001). Adverse event diversity increased at an average rate of 77\% for each age group (older than 30 years) until reaching the 60-69 years-old group, which had a diversity level of 54.7 different types of adverse events per trial arm. The 70-100 years-old group showed the highest diversity level of 55.5 events per trial arm, approximately 3.44 times more than the 20-29 years-old group (P<.001). We also observed that adverse events display strong age-related patterns among different categories. Conclusions: The results show that there is significant adverse event variance at the population level between different age groups in clinical trials. The data suggest that age-associated adverse events should be considered in planning, monitoring, and regulating clinical trials. ", doi="10.2196/medinform.6437", url="http://medinform.jmir.org/2016/4/e30/", url="http://www.ncbi.nlm.nih.gov/pubmed/27751983" } @Article{info:doi/10.2196/jmir.5549, author="Van Poucke, Sven and Thomeer, Michiel and Heath, John and Vukicevic, Milan", title="Are Randomized Controlled Trials the (G)old Standard?
From Clinical Intelligence to Prescriptive Analytics", journal="J Med Internet Res", year="2016", month="Jul", day="06", volume="18", number="7", pages="e185", keywords="randomized controlled trials", keywords="data mining", keywords="big data", keywords="predictive analytics", keywords="algorithm", keywords="modeling", keywords="ensemble methods", doi="10.2196/jmir.5549", url="http://www.jmir.org/2016/7/e185/", url="http://www.ncbi.nlm.nih.gov/pubmed/27383622" } @Article{info:doi/10.2196/medinform.5571, author="Lea, Christopher Nathan and Nicholls, Jacqueline and Dobbs, Christine and Sethi, Nayha and Cunningham, James and Ainsworth, John and Heaven, Martin and Peacock, Trevor and Peacock, Anthony and Jones, Kerina and Laurie, Graeme and Kalra, Dipak", title="Data Safe Havens and Trust: Toward a Common Understanding of Trusted Research Platforms for Governing Secure and Ethical Health Research", journal="JMIR Med Inform", year="2016", month="Jun", day="21", volume="4", number="2", pages="e22", keywords="trusted research platforms", keywords="data safe havens", keywords="trusted researchers", keywords="legislative and regulatory compliance", keywords="public engagement", keywords="public involvement", keywords="clinical research support", keywords="health record linkage supported research", keywords="genomics research support", doi="10.2196/medinform.5571", url="http://medinform.jmir.org/2016/2/e22/", url="http://www.ncbi.nlm.nih.gov/pubmed/27329087" } @Article{info:doi/10.2196/medinform.4756, author="Thilakanathan, Danan and Calvo, A. Rafael and Chen, Shiping and Nepal, Surya and Glozier, Nick", title="Facilitating Secure Sharing of Personal Health Data in the Cloud", journal="JMIR Med Inform", year="2016", month="May", day="27", volume="4", number="2", pages="e15", keywords="self care", keywords="telemedicine", keywords="privacy", keywords="computer security", keywords="information dissemination", abstract="Background: Internet-based applications are providing new ways of promoting health and reducing the cost of care. Although data can be kept encrypted on servers, the user does not have the ability to decide with whom the data are shared. Technically, this is linked to the problem of who owns the data encryption keys required to decrypt the data. Currently, cloud service providers, rather than users, have full rights to the key. In practical terms, this means users lose full control over their data. Trust and uptake of these applications can be increased by allowing patients to feel in control of their data, generally stored in cloud-based services. Objective: This paper addresses this security challenge by providing the user with a way of controlling encryption keys independently of the cloud service provider. We provide a secure and usable system that enables a patient to share health information with doctors and specialists. Methods: We contribute a secure protocol for patients to share their data with doctors and others on the cloud while keeping complete ownership. We developed a simple, stereotypical health application and carried out security tests, performance tests, and usability tests with both students and doctors (N=15). Results: We developed the health application as an app for Android mobile phones. We carried out the usability tests on potential participants and medical professionals. Of 20 participants, 14 (70\%) either agreed or strongly agreed that they felt safer using our system.
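The Thilakanathan protocol above (doi 10.2196/medinform.4756) hinges on the patient, not the cloud provider, holding the data encryption key. A minimal sketch of that principle with the Python cryptography library; the paper's actual protocol is more elaborate (it re-encrypts keys for each authorized recipient), so treat this only as an illustration of key ownership.

# Patient-held encryption key: the cloud stores only ciphertext, so sharing
# health data means sharing the key, not trusting the provider.
from cryptography.fernet import Fernet

patient_key = Fernet.generate_key()   # generated and kept by the patient
ciphertext = Fernet(patient_key).encrypt(b"HbA1c 7.2% measured 2016-04-01")

# The provider sees only `ciphertext`, never `patient_key`. To grant a doctor
# access, the patient transmits the key out of band (the paper wraps it for
# each recipient instead of sending it directly).
print(Fernet(patient_key).decrypt(ciphertext).decode())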
Using mixed methods, we show that participants agreed that privacy and security of health data are important and that our system addresses these issues. Conclusions: We presented a security protocol that enables patients to securely share their eHealth data with doctors and nurses and developed a secure and usable system that enables patients to share mental health information with doctors. ", doi="10.2196/medinform.4756", url="http://medinform.jmir.org/2016/2/e15/", url="http://www.ncbi.nlm.nih.gov/pubmed/27234691" } @Article{info:doi/10.2196/jmir.5011, author="Spencer, Karen and Sanders, Caroline and Whitley, A. Edgar and Lund, David and Kaye, Jane and Dixon, Gregory William", title="Patient Perspectives on Sharing Anonymized Personal Health Data Using a Digital System for Dynamic Consent and Research Feedback: A Qualitative Study", journal="J Med Internet Res", year="2016", month="Apr", day="15", volume="18", number="4", pages="e66", keywords="eHealth", keywords="data sharing", keywords="public trust", keywords="consent", abstract="Background: Electronic health records are widely acknowledged to provide an important opportunity to anonymize patient-level health care data and collate across populations to support research. Nonetheless, in the wake of public and policy concerns about security and inappropriate use of data, conventional approaches toward data governance may no longer be sufficient to respect and protect individual privacy. One proposed solution to improve transparency and public trust is known as Dynamic Consent, which uses information technology to facilitate a more explicit and accessible opportunity to opt out. In this case, patients can tailor preferences about whom they share their data with and can change their preferences reliably at any time. Furthermore, electronic systems provide opportunities for informing patients about data recipients and the results of research to which their data have contributed. Objective: To explore patient perspectives on the use of anonymized health care data for research purposes and to evaluate patient perceptions of a Dynamic Consent model and electronic system to enable and implement ongoing communication and collaboration between patients and researchers. Methods: A total of 26 qualitative interviews and three focus groups were conducted, each including a video presentation explaining the reuse of anonymized electronic patient records for research. Slides and tablet devices were used to introduce the Dynamic Consent system for discussion. A total of 35 patients with chronic rheumatic disease, with varying levels of illness and social deprivation, were recruited from a rheumatology outpatient clinic; 5 participants were recruited from a patient and public involvement health research network. Results: Patients were supportive of sharing their anonymized electronic patient record for research, but noted a lack of transparency and awareness around the use of data, making it difficult to secure public trust. While there were general concerns about detrimental consequences of data falling into the wrong hands, such as insurance companies, 39 out of 40 (98\%) participants generally considered that the altruistic benefits of sharing health care data outweighed the risks. Views were mostly positive about the use of an electronic interface to enable greater control over consent choices, although some patients were happy to share their data without further engagement.
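A sketch of the Dynamic Consent mechanics evaluated in the Spencer study above (doi 10.2196/jmir.5011): per-recipient sharing preferences that the patient can revise reliably at any time, with each change logged so that feedback can later be tied to actual data recipients. The field names are invented; a real system would add authentication and durable audit storage.

# Dynamic Consent sketch: patients tailor who may receive their anonymized
# record, can change their mind at any time, and every change is logged.
from dataclasses import dataclass, field
from datetime import datetime, timezone

@dataclass
class ConsentRecord:
    patient_id: str
    recipients: dict = field(default_factory=dict)  # recipient -> allowed?
    audit_log: list = field(default_factory=list)

    def set_preference(self, recipient: str, allowed: bool) -> None:
        self.recipients[recipient] = allowed
        self.audit_log.append((datetime.now(timezone.utc), recipient, allowed))

    def may_share_with(self, recipient: str) -> bool:
        return self.recipients.get(recipient, False)  # default is opt-out

consent = ConsentRecord("patient-042")
consent.set_preference("university-research", True)
consent.set_preference("insurance-company", False)  # a concern participants raised
print(consent.may_share_with("university-research"))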
Participants were particularly enthusiastic about the system as a means of enabling feedback regarding data recipients and associated research results, noting that this would improve trust and public engagement in research. This underlines the importance of patient and public involvement and engagement throughout the research process, including the reuse of anonymized health care data for research. More than half of patients found the touch screen interface easy to use, although a significant minority, especially those with limited access to technology, expressed some trepidation and felt they may need support to use the system. Conclusions: Patients from a range of socioeconomic backgrounds viewed a digital system for Dynamic Consent positively, in particular, feedback about data recipients and research results. Implementation of a digital Dynamic Consent system would require careful interface design and would need to be located within a robust data infrastructure; it has the potential to improve trust and engagement in electronic medical record research. ", doi="10.2196/jmir.5011", url="http://www.jmir.org/2016/4/e66/", url="http://www.ncbi.nlm.nih.gov/pubmed/27083521" } @Article{info:doi/10.2196/medinform.4640, author="Khazaei, Hamzeh and McGregor, Carolyn and Eklund, Mikael J. and El-Khatib, Khalil", title="Real-Time and Retrospective Health-Analytics-as-a-Service: A Novel Framework", journal="JMIR Med Inform", year="2015", month="Nov", day="18", volume="3", number="4", pages="e36", keywords="premature babies", keywords="physiological data", keywords="decision support system", keywords="analytics-as-a-service", keywords="cloud computing", keywords="big data", keywords="health informatics", keywords="real-time analytics", keywords="retrospective analysis", keywords="performance modeling", abstract="Background: Analytics-as-a-service (AaaS) is one of the latest provisions emerging from the cloud services family. Utilizing this paradigm of computing in health informatics will benefit patients, care providers, and governments significantly. This work is a novel approach to realizing health analytics as services, in critical care units in particular. Objective: To design, implement, evaluate, and deploy an extendable, big data-compatible framework for health-analytics-as-a-service that offers both real-time and retrospective analysis. Methods: We present a novel framework that can realize health data analytics-as-a-service. The framework is flexible and configurable for different scenarios, utilizing the latest technologies and best practices for data acquisition, transformation, storage, analytics, knowledge extraction, and visualization. We have instantiated the proposed framework through the Artemis project, a customization of the framework for live monitoring and retrospective research on premature babies and ill term infants in neonatal intensive care units (NICUs). Results: We demonstrated the proposed framework in this paper for monitoring NICUs and refer to it as the Artemis-In-Cloud (Artemis-IC) project. A pilot of Artemis has been deployed in the SickKids hospital NICU. By feeding the output of this pilot setup into an analytical model, we predict important performance measures for the final deployment of Artemis-IC. This process can be carried out for other hospitals following the same steps with minimal effort. SickKids' NICU has 36 beds and can classify the patients generally into 5 different types including surgical and premature babies.
The arrival rate was estimated at 4.5 patients per day, and the average length of stay was calculated as 16 days. The mean number of medical monitoring algorithms per patient is 9, which yields 311 live algorithms running on the framework for the whole NICU. The memory and computation power required for Artemis-IC to handle the SickKids NICU will be 32 GB and 16 CPU cores, respectively. The required amount of storage was estimated as 8.6 TB per year. On average, there will be 34.9 patients in the SickKids NICU at any time. Currently, 46\% of patients cannot be admitted to the SickKids NICU because of a lack of resources. By increasing the capacity to 90 beds, all patients can be accommodated. For such a provisioning, Artemis-IC will need 16 TB of storage per year, 55 GB of memory, and 28 CPU cores. Conclusions: Our contributions in this work relate to a cloud architecture for the analysis of physiological data for clinical decision support in tertiary care. We demonstrate how to size the equipment needed in the cloud for that architecture based on a realistic assessment of the patient characteristics and the associated clinical decision support algorithms that would be required to run for those patients. We show in principle how this could be performed and, furthermore, that it can be replicated for any critical care setting within a tertiary institution. ", doi="10.2196/medinform.4640", url="http://medinform.jmir.org/2015/4/e36/", url="http://www.ncbi.nlm.nih.gov/pubmed/26582268" } @Article{info:doi/10.2196/medinform.4221, author="Bettencourt-Silva, H. Joao and Clark, Jeremy and Cooper, S. Colin and Mills, Robert and Rayward-Smith, J. Victor and de la Iglesia, Beatriz", title="Building Data-Driven Pathways From Routinely Collected Hospital Data: A Case Study on Prostate Cancer", journal="JMIR Med Inform", year="2015", month="Jul", day="10", volume="3", number="3", pages="e26", keywords="hospital information systems", keywords="data summarization", keywords="clinical pathways", keywords="data quality", keywords="visualization", keywords="prostate cancer", keywords="electronic medical records", abstract="Background: Routinely collected data in hospitals are complex, typically heterogeneous, and scattered across multiple Hospital Information Systems (HIS). These big data, created as a byproduct of health care activities, have the potential to provide a better understanding of diseases, unearth hidden patterns, and improve services while reducing costs. The extent and uses of such data rely on their quality, which is neither consistently checked nor fully understood. Nevertheless, using routine data for the construction of data-driven clinical pathways, describing processes and trends, is a key topic receiving increasing attention in the literature. Traditional algorithms do not cope well with unstructured processes or data, and do not produce clinically meaningful visualizations. Supporting systems that provide additional information, context, and quality assurance inspection are needed. Objective: The objective of the study is to explore how routine hospital data can be used to develop data-driven pathways that describe the journeys that patients take through care, and their potential uses in biomedical research; it proposes a framework for the construction, quality assessment, and visualization of patient pathways for clinical studies and decision support, using a case study on prostate cancer.
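The Artemis-IC sizing exercise in the Khazaei entry above (doi 10.2196/medinform.4640) follows a repeatable recipe: estimate the average census from arrival rate and length of stay, multiply by monitoring algorithms per patient, then scale per-unit resource figures. A sketch of those steps; the per-algorithm and per-patient constants are invented placeholders, and the simple min() cap stands in for the paper's queueing model (which yields a slightly lower census of 34.9).

# Capacity sizing for an analytics-as-a-service deployment: Little's law
# (census = arrival rate x length of stay) capped by bed count, then scaled
# by per-algorithm resource needs.
def size_deployment(arrivals_per_day, avg_los_days, beds, algos_per_patient,
                    gb_ram_per_algo=0.1,      # invented placeholder
                    cores_per_algo=0.05,      # invented placeholder
                    tb_storage_per_patient_year=0.25):  # invented placeholder
    census = min(arrivals_per_day * avg_los_days, beds)
    live_algos = census * algos_per_patient
    return {
        "avg_census": round(census, 1),
        "live_algorithms": round(live_algos),
        "ram_gb": round(live_algos * gb_ram_per_algo, 1),
        "cpu_cores": round(live_algos * cores_per_algo, 1),
        "storage_tb_per_year": round(census * tb_storage_per_patient_year, 1),
    }

# SickKids-like scenario from the entry above: demand (4.5/day x 16 days = 72)
# exceeds the 36 beds, so the census is capped at capacity.
print(size_deployment(4.5, 16, 36, 9))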
Methods: Data pertaining to prostate cancer patients were extracted from eight different HIS at a large UK hospital, validated, and complemented with information from the local cancer registry. Data-driven pathways were built for each of the 1904 patients, and an expert knowledge base, containing rules on the prostate cancer biomarker, was used to assess the completeness and utility of the pathways for a specific clinical study. Software components were built to provide meaningful visualizations of the constructed pathways. Results: The proposed framework and pathway formalism enable the summarization, visualization, and querying of complex patient-centric clinical information, as well as the computation of quality indicators and dimensions. A novel graphical representation of the pathways allows the synthesis of such information. Conclusions: Clinical pathways built from routinely collected hospital data can unearth information about patients and diseases that may otherwise be unavailable or overlooked in hospitals. Data-driven clinical pathways allow for heterogeneous data (ie, semistructured and unstructured data) to be collated over a unified data model and for data quality dimensions to be assessed. This work has enabled further research on prostate cancer and its biomarkers, and on the development and application of methods to mine, compare, analyze, and visualize pathways constructed from routine data. This is an important development for the reuse of big data in hospitals. ", doi="10.2196/medinform.4221", url="http://medinform.jmir.org/2015/3/e26/", url="http://www.ncbi.nlm.nih.gov/pubmed/26162314" } @Article{info:doi/10.2196/jmir.3082, author="Tillmann, Taavi and Gibson, R. Alexander and Scott, Gregory and Harrison, Oliver and Dominiczak, Anna and Hanlon, Phil", title="Systems Medicine 2.0: Potential Benefits of Combining Electronic Health Care Records With Systems Science Models", journal="J Med Internet Res", year="2015", month="Mar", day="23", volume="17", number="3", pages="e64", keywords="gene-environment interaction", keywords="systems theory", keywords="electronic health records", keywords="epidemiology", keywords="online social networks", keywords="crowd-sourcing", keywords="Web 2.0", abstract="Background: The global burden of disease is increasingly dominated by non-communicable diseases. These diseases are less amenable to curative and preventative interventions than communicable diseases. This presents a challenge to medical practice and medical research, both of which are experiencing diminishing returns from increasing investment. Objective: Our aim was to (1) review how medical knowledge is generated, and its limitations, (2) assess the potential for emerging technologies and ideas to improve medical research, and (3) suggest solutions and recommendations to increase medical research efficiency on non-communicable diseases. Methods: We undertook an unsystematic review of peer-reviewed literature and technology websites. Results: Our review generated the following conclusions and recommendations. (1) Medical knowledge continues to be generated in a reductionist paradigm. This oversimplifies our models of disease, rendering them ineffective for understanding the complex nature of non-communicable diseases. (2) Some of these failings may be overcome by adopting a ``Systems Medicine'' paradigm, where the human body is modeled as a complex adaptive system.
That is, a system with multiple components and levels interacting in complex ways, wherein disease emerges from slow changes to the system set-up. Pursuing systems medicine research will require larger datasets. (3) Increased data sharing between researchers, patients, and clinicians could meet this unmet need for data. The recent emergence of electronic health care records (EHR) could potentially facilitate this in real time and at a global level. (4) Efforts should continue to aggregate anonymous EHR data into large interoperable data silos and release these to researchers. However, international collaboration, data linkage, and obtaining additional information from patients will remain challenging. (5) Efforts should also continue towards ``Medicine 2.0''. Patients should be given access to their personal EHR data. Subsequently, online communities can give researchers the opportunity to ask patients for direct access to the patient's EHR data and to request additional study-specific information. However, selection bias towards patients who use Web 2.0 technology may be difficult to overcome. Conclusions: Systems medicine, when combined with large-scale data sharing, has the potential to advance our understanding of non-communicable diseases, foster personalized medicine, and make substantial progress towards halting, curing, and preventing non-communicable diseases. Large-scale data amalgamation remains a core challenge and needs to be supported. A synthesis of ``Medicine 2.0'' and ``Systems Science'' concepts into ``Systems Medicine 2.0'' could take decades to materialize but holds much promise. ", doi="10.2196/jmir.3082", url="http://www.jmir.org/2015/3/e64/", url="http://www.ncbi.nlm.nih.gov/pubmed/25831125" } @Article{info:doi/10.2196/mhealth.3694, author="Mergel, Ines", title="The Long Way From Government Open Data to Mobile Health Apps: Overcoming Institutional Barriers in the US Federal Government", journal="JMIR mHealth uHealth", year="2014", month="Dec", day="23", volume="2", number="4", pages="e58", keywords="mHealth", keywords="mobile apps", keywords="open data", keywords="prizes and challenges", abstract="Background: Government agencies in the United States are creating mobile health (mHealth) apps as part of recent policy changes initiated by the White House's Digital Government Strategy. Objective: The objective of the study was to understand the institutional and managerial barriers to the implementation of mHealth, as well as the resulting adoption pathways of mHealth. Methods: This article is based on insights derived from qualitative interview data with 35 public managers in charge of promoting the reuse of open data through Challenge.gov, the platform created to run prizes and challenges and to oversee the vetting and implementation of the winning and vendor-created apps. Results: The process of designing apps follows three different pathways: (1) entrepreneurs start to see opportunities for mobile apps and develop them either in-house or contract out to already vetted Web design vendors; (2) a top-down policy mandates agencies to adopt at least two customer-facing mobile apps; and (3) the federal government uses a policy instrument called ``Prizes and Challenges'', encouraging civic hackers to design health-related mobile apps using open government data from HealthData.gov, in combination with citizen needs. All pathways of the development process encounter a set of major obstacles that have to be actively managed before agencies can promote mobile apps on their websites and in app stores.
Conclusions: Beyond the cultural paradigm shift to design interactive apps and to open health-related data to the public, the managerial challenges include accessibility, interoperability, security, privacy, and legal concerns around interactive apps that track citizens. ", doi="10.2196/mhealth.3694", url="http://mhealth.jmir.org/2014/4/e58/", url="http://www.ncbi.nlm.nih.gov/pubmed/25537314" } @Article{info:doi/10.2196/jmir.3871, author="Moseley, T. Edward and Hsu, J. Douglas and Stone, J. David and Celi, Anthony Leo", title="Beyond Open Big Data: Addressing Unreliable Research", journal="J Med Internet Res", year="2014", month="Nov", day="11", volume="16", number="11", pages="e259", keywords="open data", keywords="unreliable research", keywords="collaborative learning", keywords="knowledge discovery", keywords="peer review", keywords="research culture", doi="10.2196/jmir.3871", url="http://www.jmir.org/2014/11/e259/", url="http://www.ncbi.nlm.nih.gov/pubmed/25405277" } @Article{info:doi/10.2196/medinform.3447, author="Badawi, Omar and Brennan, Thomas and Celi, Anthony Leo and Feng, Mengling and Ghassemi, Marzyeh and Ippolito, Andrea and Johnson, Alistair and Mark, G. Roger and Mayaud, Louis and Moody, George and Moses, Christopher and Naumann, Tristan and Nikore, Vipan and Pimentel, Marco and Pollard, J. Tom and Santos, Mauro and Stone, J. David and Zimolzak, Andrew and ", title="Making Big Data Useful for Health Care: A Summary of the Inaugural MIT Critical Data Conference", journal="JMIR Med Inform", year="2014", month="Aug", day="22", volume="2", number="2", pages="e22", keywords="big data", keywords="open data", keywords="unreliable research", keywords="machine learning", keywords="knowledge creation", doi="10.2196/medinform.3447", url="http://medinform.jmir.org/2014/2/e22/" } @Article{info:doi/10.2196/medinform.3110, author="Celi, Anthony Leo and Zimolzak, J. Andrew and Stone, J. David", title="Dynamic Clinical Data Mining: Search Engine-Based Decision Support", journal="JMIR Med Inform", year="2014", month="Jun", day="23", volume="2", number="1", pages="e13", keywords="decision support", keywords="clinical informatics", keywords="big data", doi="10.2196/medinform.3110", url="http://medinform.jmir.org/2014/1/e13/", url="http://www.ncbi.nlm.nih.gov/pubmed/25600664" } @Article{info:doi/10.2196/medinform.2913, author="Wang, Weiqi and Krishnan, Eswar", title="Big Data and Clinicians: A Review on the State of the Science", journal="JMIR Med Inform", year="2014", month="Jan", day="17", volume="2", number="1", pages="e1", keywords="big data", keywords="database", keywords="medical informatics", keywords="clinical research", keywords="medicine", abstract="Background: In the past few decades, medically related data collection has seen a huge increase, yielding what is referred to as big data. These huge datasets bring challenges in storage, processing, and analysis. In clinical medicine, big data is expected to play an important role in identifying causality of patient symptoms, in predicting hazards of disease incidence or reoccurrence, and in improving primary-care quality. Objective: The objective of this review was to provide an overview of the features of clinical big data, describe a few commonly employed computational algorithms, statistical methods, and software toolkits for data manipulation and analysis, and discuss the challenges and limitations in this realm. Methods: We conducted a literature review to identify studies on big data in medicine, especially clinical medicine.
We used different combinations of keywords to search PubMed, Science Direct, Web of Knowledge, and Google Scholar for literature of interest from the past 10 years. Results: This paper reviewed studies that analyzed clinical big data and discussed issues related to storage and analysis of this type of data. Conclusions: Big data is becoming a common feature of biological and clinical studies. Researchers who use clinical big data face multiple challenges, and the data itself has limitations. It is imperative that methodologies for data analysis keep pace with our ability to collect and store data. ", doi="10.2196/medinform.2913", url="http://www.medinform.jmir.org/2014/1/e1/", url="http://www.ncbi.nlm.nih.gov/pubmed/25600256" }
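Finally, a sketch of the data-driven pathway construction described in the Bettencourt-Silva entry above (doi 10.2196/medinform.4221): timestamped events scattered across several hospital systems are collated into one chronologically ordered journey per patient. The example events, including the PSA reading, are invented.

# Patient pathway sketch: merge timestamped events from multiple hospital
# information systems into one ordered per-patient journey.
from collections import defaultdict
from datetime import date

events = [  # (patient_id, date, source_system, event) -- invented examples
    ("p1", date(2014, 3, 2), "pathology", "PSA 8.1 ng/mL"),
    ("p1", date(2014, 1, 15), "clinic", "urology referral"),
    ("p1", date(2014, 4, 10), "theatre", "prostate biopsy"),
]

pathways = defaultdict(list)
for pid, when, source, what in events:
    pathways[pid].append((when, source, what))

for pid, path in pathways.items():
    path.sort()  # chronological order makes the journey readable
    print(pid, [f"{d}: {w} ({s})" for d, s, w in path])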