<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e79160</article-id><article-id pub-id-type="doi">10.2196/79160</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Code-Based Versus AutoML Methods for Pill Recognition in Clinical Settings: Comparative Performance Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Ashraf</surname><given-names>Amir Reza</given-names></name><degrees>PharmD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>R&#x00E1;dli</surname><given-names>Rich&#x00E1;rd</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>V&#x00F6;r&#x00F6;sh&#x00E1;zi</surname><given-names>Zsolt</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Fittler</surname><given-names>Andr&#x00E1;s</given-names></name><degrees>PharmD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Pharmaceutics, Faculty of Pharmacy, University of P&#x00E9;cs</institution><addr-line>R&#x00F3;kus utca 4</addr-line><addr-line>P&#x00E9;cs</addr-line><country>Hungary</country></aff><aff id="aff2"><institution>Image Processing Research Laboratory, University of Pannonia</institution><addr-line>Veszpr&#x00E9;m</addr-line><country>Hungary</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Njoku</surname><given-names>Amarachi</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Aarts</surname><given-names>Jos</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Han</surname><given-names>Shengtong</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Amir Reza Ashraf, PharmD, PhD, Department of Pharmaceutics, Faculty of Pharmacy, University of P&#x00E9;cs, R&#x00F3;kus utca 4, P&#x00E9;cs, H-7624, Hungary, +36 72503650 ext 28841; <email>ashraf.amir.reza@pte.hu</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>10</day><month>4</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e79160</elocation-id><history><date date-type="received"><day>16</day><month>06</month><year>2025</year></date><date date-type="rev-recd"><day>12</day><month>01</month><year>2026</year></date><date date-type="accepted"><day>17</day><month>02</month><year>2026</year></date></history><copyright-statement>&#x00A9; Amir Reza Ashraf, Rich&#x00E1;rd 
R&#x00E1;dli, Zsolt V&#x00F6;r&#x00F6;sh&#x00E1;zi, Andr&#x00E1;s Fittler. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 10.4.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e79160"/><abstract><sec><title>Background</title><p>Visual identification and verification of medications during dispensing and administration are prone to human error, particularly in high-pressure and high-volume clinical settings. Misidentification can lead to medication errors, posing risks to patient safety and placing a burden on health care systems. Recent advances in computer vision and object detection offer promising solutions for automated solid oral dosage form (pill) recognition. 
However, comprehensive studies comparing code-based and no-code (automated machine learning [AutoML]) approaches for pill recognition are lacking.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate and compare the performance, cost, usability, and deployment feasibility of pill recognition models developed with Ultralytics YOLO11 and 3 cloud-based AutoML platforms (Amazon Rekognition Custom Labels, Google Vertex artificial intelligence [AI] AutoML Vision, and Microsoft Azure Custom Vision) using multiple datasets, including real-world clinical images.</p></sec><sec sec-type="methods"><title>Methods</title><p>Five training subsets of increasing size (1230, 3450, 7380, 14,400, and 26,880 images) from 30 commonly dispensed medications were used to train models on YOLO11 and 3 AutoML platforms. Models were evaluated on 6 datasets from different environments: clinical images from 3 hospitals, a verification dataset, a laboratory dataset, and an exhaustive testing set. Performance metrics, including accuracy, precision, recall, and mean average precision, were calculated. We evaluated the impact of training data size on performance and benchmarked training time, platform costs, and limitations.</p></sec><sec sec-type="results"><title>Results</title><p>No single platform dominated across all test environments. On the verification dataset (optimal conditions), accuracy ranged from 80.83% (YOLO11) to 91.60% (Google Vertex AI) when trained with the full training dataset. YOLO11 showed consistent performance improvement with increasing training data (accuracy: 63.06%-80.83%) and achieved near-perfect precision and mean average precision scores (0.95&#x2010;1.00). Google Vertex AI reached above 90% accuracy on 3 training subsets but showed unpredictable declines. Amazon Rekognition maintained near-perfect precision (0.92&#x2010;1.00) but had the highest false negative rates (up to 0.74), missing many pills. 
Custom Vision demonstrated steady performance improvements (77.08%-85.62% accuracy) but lagged behind other AutoML platforms, probably due to its older YOLOv2-based architecture. On clinical datasets, accuracy fluctuated (20.62%-90%) depending on the dataset and platform. Training costs and time varied: YOLO11 (open-source), Microsoft Azure (US $9.50-US $28.60, allowed user-predefined training duration), Google Vertex AI (US $69.30 with consistent 2.5&#x2010;3-hour training times), and Amazon Rekognition (US $5.43-US $43.89 with size-dependent training time scaling, reaching nearly 40 hours on the full 26,880-image dataset).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Each platform offers distinct advantages and trade-offs: YOLO11 provides the highest flexibility and lowest platform costs but requires technical expertise, while AutoML platforms can offer high performance at a higher cost but with limited user control, introducing unpredictability. The performance variations demonstrate that successful clinical deployment requires careful platform selection based on specific performance requirements, budget constraints, and available technical resources, followed by rigorous validation using real-world, representative data to ensure patient safety in clinical workflows.</p></sec></abstract><kwd-group><kwd>pill recognition</kwd><kwd>clinical pharmacy</kwd><kwd>medication safety</kwd><kwd>medication errors</kwd><kwd>clinical decision support</kwd><kwd>You Only Look Once</kwd><kwd>YOLO11</kwd><kwd>automated machine learning</kwd><kwd>AutoML</kwd><kwd>computer vision</kwd><kwd>object detection</kwd><kwd>deep learning</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Medication errors are preventable failures in medical care that can potentially lead to patient harm and increased health care costs, with a global financial burden estimated at US $42 
billion annually [<xref ref-type="bibr" rid="ref1">1</xref>]. These errors are responsible for an estimated 7000&#x2010;9000 deaths per year in the United States [<xref ref-type="bibr" rid="ref1">1</xref>] and contribute to 1708 deaths in England [<xref ref-type="bibr" rid="ref2">2</xref>], highlighting the urgent need for improved medication safety strategies. Medication errors can occur at various stages of therapy, from prescribing to dispensing and administration of medications [<xref ref-type="bibr" rid="ref3">3</xref>], and are among the most common causes of death [<xref ref-type="bibr" rid="ref4">4</xref>]. Medication therapy management in hospitals, clinics, and inpatient health care settings is often managed under the supervision of a clinical pharmacist, who is responsible for monitoring patients&#x2019; treatment process and identifying any discrepancies [<xref ref-type="bibr" rid="ref5">5</xref>]. During inpatient care, medication reconciliation, that is, the process in which pharmacists compare the medications a patient is currently taking or should be taking with newly ordered therapies, often requires visually distinguishing between numerous solid oral dosage forms (eg, pills, tablets, and capsules, often referred to collectively as &#x201C;pills&#x201D;) based on the patient&#x2019;s therapy regimen. This process can be challenging because these products often show only subtle differences in their physical characteristics [<xref ref-type="bibr" rid="ref6">6</xref>]. Therefore, even the most experienced professionals could occasionally make mistakes, as visual identification and verification are prone to human error, especially in high-pressure and high-volume environments such as hospitals and clinics [<xref ref-type="bibr" rid="ref7">7</xref>]. 
These challenges and limitations have led to investments in automation technologies for unit-dose dispensing to improve hospital pharmacy services [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>] and the exploration of leveraging novel techniques, such as object detection, to assist pharmacists with verification tasks and ultimately reduce dispensing errors [<xref ref-type="bibr" rid="ref6">6</xref>]. However, it is important to note the specific clinical context for such tools. Unlike medication reconciliation at admission, where a patient may present a disorganized collection of loose pills for identification, during hospital pharmacy manual dispensing processes, staff typically use organized unit doses and dosette boxes. In this context, the primary challenge is usually not identifying a mystery pill but verifying that the pill in a specific compartment matches the prescribed therapy. Automated pill recognition systems could support this verification process, as well as additional workflows, including detection of dispensing errors before medications reach patients and identification of medications returned to the hospital pharmacy unit.</p><p>Recent advances in computer vision and object detection offer promising solutions for image analysis, including high-performance object detection models based on the You Only Look Once (YOLO) architecture, which allows fast and accurate localization and classification in a single step [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. YOLO has proven to be an attractive choice for a variety of medical applications [<xref ref-type="bibr" rid="ref11">11</xref>], including pill recognition [<xref ref-type="bibr" rid="ref12">12</xref>]. 
YOLO has demonstrated strong performance in code-based pill recognition systems, particularly for preprocessing segmented images within multistream, 2-phase neural models using metric embedding [<xref ref-type="bibr" rid="ref13">13</xref>]. One of the latest iterations in the Ultralytics YOLO series is YOLO11, released in September 2024. This version improved upon previous models by implementing a new backbone architecture that enhances feature extraction (identifying key patterns such as edges, textures, or shapes from raw image data) with improved accuracy and processing speed, making it suitable for complex visual recognition tasks [<xref ref-type="bibr" rid="ref14">14</xref>]. These advancements create new opportunities for developing clinical decision support systems that assist health care professionals in the visual identification of pills.</p><p>In parallel with the evolution of code-based models, which provide flexibility and customization but require strong programming skills and longer development time, low-code or code-free automated machine learning (AutoML) platforms have emerged. These platforms enable relatively simple development and testing of specialized object detection models using easy-to-use cloud-based platforms [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>].</p></sec><sec id="s1-2"><title>Rationale</title><p>Although both traditional code-based models and cloud-based AutoML platforms have shown promise in medical image analysis [<xref ref-type="bibr" rid="ref17">17</xref>], there is a lack of comprehensive, direct comparative studies evaluating the performance of these approaches in the context of pill recognition. Previous studies have commonly focused on controlled environments that do not reflect typical usage scenarios and conditions. 
The open-source, code-based deep learning object detection YOLO model and code-free, cloud-based AutoML platforms represent fundamentally different development concepts, each with its own associated advantages and limitations. This justifies a comprehensive comparison of these solutions to determine robustness, scalability, and implementation complexity in practical environments. Limited published, peer-reviewed studies compare the performance of traditional code-based and code-free pill recognition models using images captured in real-world clinical environments, which often differ substantially from standardized reference images. To our knowledge, this is the first study to directly compare these approaches using clinical data under such specific conditions.</p></sec><sec id="s1-3"><title>Objectives</title><p>This study aimed to evaluate and compare the effectiveness of pill recognition models developed using YOLO11 with cloud-based AutoML platforms provided by Amazon Rekognition Custom Labels, Google Vertex artificial intelligence (AI) AutoML Vision, and Microsoft Azure Custom Vision. Factors, such as cost, ease of use, and deployment feasibility, as well as key performance metrics including accuracy, precision, recall, <italic>F</italic><sub>1</sub>-score, mean average precision (mAP), overall error rates (OERs), and false negative rates (FNRs), were evaluated on different test sets, including real-world images captured by clinical pharmacists.</p></sec><sec id="s1-4"><title>Code-Based and Code-Free Object Detection</title><p>Traditional code-based deep learning frameworks, such as YOLO11, require programming and machine learning (ML) expertise but offer full control over model architecture and training parameters, whereas code-free development and training using cloud-based AutoML platforms provide simplified workflows with limited customization options. 
The YOLO family of deep-learning&#x2013;based object detectors uses a single-stage approach, which means that the entire input image is processed in a single forward pass, predicting bounding boxes and class probabilities simultaneously [<xref ref-type="bibr" rid="ref18">18</xref>]. This allows YOLO models to balance inference speed and detection accuracy, making them one of the most widely used object detection algorithms. In our experiments, we used the YOLO11 real-time object detection model, which was designed to address a wide spectrum of application requirements and supports a wide range of computer vision tasks, including object detection [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. A detailed description of the YOLO11 model structure is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>In a traditional deep learning ecosystem, model development involves several manual and iterative processes, such as data collection and preprocessing, feature engineering, model selection, hyperparameter tuning, training, and validation, as shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>. This approach requires a high level of expertise and intensive experimentation to fine-tune the model&#x2019;s performance.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Traditional machine learning (ML) vs automated machine learning (AutoML) workflows.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e79160_fig01.png"/></fig><p>Cloud-based, no-code AutoML platforms have emerged as suitable alternatives to traditional, often code-intensive model development approaches. 
These platforms automate most of the workflows involved in managing ML pipelines, thereby lowering the barrier to entry and making model development accessible even to those without programming skills or data science expertise [<xref ref-type="bibr" rid="ref19">19</xref>]. In addition to enhancing accessibility, these platforms accelerate the model development lifecycle by automating repetitive and time-consuming tasks, allowing rapid prototyping, while reducing complexity by handling the underlying tasks associated with managing the computational infrastructure and ecosystem required for model development [<xref ref-type="bibr" rid="ref19">19</xref>]. However, AutoML has inherent disadvantages. While AutoML democratizes model development by minimizing human effort, it requires significant computational resources, which can lead to expensive cloud service costs, particularly when developing models iteratively using large datasets. AutoML users typically have very limited control over the choice of base models, underlying training algorithms, and hyperparameter adjustments. In addition, AutoML platforms lack transparency in model architecture and hyperparameters and may raise privacy concerns [<xref ref-type="bibr" rid="ref20">20</xref>]. Therefore, the choice of the optimal platform depends on a variety of factors.</p><p>This study aimed to provide empirical evidence to guide such platform selection decisions for pill recognition model development intended for deployment in clinical and hospital pharmacy settings, where accuracy and reliability are of utmost importance. 
We evaluated AutoML solutions offered by Amazon Web Services (AWS), Google Cloud, and Microsoft Azure, 3 of the most popular cloud-based providers for object detection, alongside code-based models trained with a traditional workflow using YOLO11.</p><p>Microsoft offers a wide range of AI and ML services on its Azure platform, including Custom Vision, a more specialized service focused on building, training, and deploying custom image classification and object detection models using a no-code or low-code approach [<xref ref-type="bibr" rid="ref21">21</xref>]. It is accessible through a dedicated, code-free web interface and software development kits (SDKs) requiring minimal coding experience. Vertex AI AutoML Vision is part of Google Cloud&#x2019;s unified Vertex AI platform. Vertex AI allows users to train, deploy, and fine-tune a wide range of models with supervised learning, data- and application-specific training strategies, and tools to manage the entire lifecycle of ML [<xref ref-type="bibr" rid="ref22">22</xref>]. Vertex AI offers choices for a manual approach that provides full control over the training pipeline using traditional coding development, as well as AutoML for code-free model development, which is accessible via the Google Cloud Console web interface and through SDKs compatible with popular open-source libraries such as TensorFlow (Google) and PyTorch (Meta AI) [<xref ref-type="bibr" rid="ref23">23</xref>]. Amazon Rekognition Custom Labels is a feature within AWS&#x2019;s broader Rekognition service that allows users to train custom object detection models using AutoML [<xref ref-type="bibr" rid="ref24">24</xref>]. 
Although Amazon Rekognition follows a similar workflow to other platforms, including the option of manual labeling, it imposes stricter restrictions on the ML workflow.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>This section details the methodologies used for data collection, preprocessing, and coded model development. We compared 2 fundamentally different approaches to model development and training: traditional code-based deep learning using YOLO11, and code-free development using 3 major cloud-based AutoML platforms (Amazon Rekognition Custom Labels, Vertex AI AutoML Vision, and Custom Vision). We performed 5 training runs per platform using 5 progressively larger subsets of our training images. The evaluation metrics used for benchmarking and comparative analysis are explained in detail in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p></sec><sec id="s2-2"><title>Data Collection and Preprocessing</title><p>For model training and evaluation, we used a tailor-made proprietary dataset focusing on 30 commonly dispensed solid oral dosage forms in Hungary (<xref ref-type="fig" rid="figure2">Figure 2</xref>). 
Selection of medications and curation of the dataset followed a strict protocol described in our previous publication [<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>For testing datasets, images were captured with a single pill placed in blue-and-white dosette boxes routinely used for manual dispensing in Hungarian clinical settings, except for one dataset prepared for exhaustive performance evaluation, which contained multiple pills per image (refer to <xref ref-type="fig" rid="figure3">Figure 3</xref>).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>The 30 medications selected for this study.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e79160_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Illustration of testing datasets.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e79160_fig03.png"/></fig><p>Training images had a resolution of 1200&#x00D7;900 pixels, while test images were at native device resolution (4032&#x00D7;3024 and 4000&#x00D7;3000 pixels) to better simulate real-world conditions where input images vary in size. All images were captured in JPEG format with consistent quality settings.</p><p>The study&#x2019;s datasets were organized as follows:</p><list list-type="bullet"><list-item><p>The &#x201C;full training dataset&#x201D; comprised 26,880 images in total, provided by the University of P&#x00E9;cs, with 896 images per class for the 30 selected medications. 
To evaluate the impact of training data size on the model performance, 5 subsets were created from these images using a logarithmic binning strategy, each with a progressively increasing number of images:</p><list list-type="bullet"><list-item><p>Subset 1: 1230 images (41 images/class)</p></list-item><list-item><p>Subset 2: 3450 images (115 images/class)</p></list-item><list-item><p>Subset 3: 7380 images (246 images/class)</p></list-item><list-item><p>Subset 4: 14,400 images (480 images/class)</p></list-item><list-item><p>Subset 5: 26,880 images (full training dataset, 896 images/class)</p></list-item></list></list-item><list-item><p>A &#x201C;verification dataset<italic>&#x201D;</italic> containing 1440 images (48/class) was created at the University of P&#x00E9;cs using the same location, conditions, and procedures as the training dataset images.</p></list-item><list-item><p>Three <italic>&#x201C;</italic>uncontrolled clinical test sets (UCTSs)&#x201D; of 480 images each (16/class) were captured by clinical pharmacists at Kaposv&#x00E1;r, Koml&#x00F3;, and P&#x00E9;cs hospitals under authentic clinical conditions to assess model performance in uncontrolled health care environments. UCTS images were collected using purposive sampling to ensure balanced representation across all medication classes (16 images/class at each hospital site). This design enables fair per-class comparison of model performance, avoiding bias toward majority classes seen with imbalanced test sets. Similar balanced sampling approaches are used in established pill recognition benchmarks, such as the CURE dataset [<xref ref-type="bibr" rid="ref25">25</xref>] and the National Library of Medicine Pill Image Recognition Challenge [<xref ref-type="bibr" rid="ref26">26</xref>]. To create the dataset, clinical pharmacists photographed medications within their routine workspaces using the dosette boxes and imaging equipment available in their daily practice. 
Class identity recorded during image acquisition served as the ground truth label during subsequent evaluations. This protocol preserved authentic environmental and human variables affecting input image quality, including camera variability, lighting conditions, and operator technique, while maintaining balanced class representation for fair platform comparison. No image quality filtering or manipulation was applied; all captured images were included regardless of quality to ensure the dataset accurately reflected authentic real-world variability in clinical settings.</p></list-item><list-item><p>Two additional test datasets were created in a controlled laboratory environment at the Image Processing Laboratory, University of Pannonia. The first dataset, referred to as &#x201C;laboratory test set,<italic>&#x201D;</italic> contained 120 images with single pills per image, while the second dataset included 120 multipill images called &#x201C;exhaustive test set&#x201D; intended for exhaustive model testing. Medications were arranged in a white dosette box, and controllable top-mounted lighting was used to evenly illuminate the pills. Images were captured with a fixed camera position at 3840&#x00D7;2160 resolution, with a slight reduction in resolution following image undistortion.</p></list-item></list></sec><sec id="s2-3"><title>YOLO11 Implementation</title><p>For our experiments, we selected the medium-sized pretrained model (YOLO11-m), which balances speed, accuracy, and training time. All hyperparameters remained at their default values, except for the early stopping patience parameter, which was set to 15 epochs to terminate training if the validation loss did not improve within 15 consecutive epochs. 
Standard data augmentation techniques (eg, rotation, brightness adjustment, and so on) were not applied during training to ensure uniform conditions across platforms, as cloud-based AutoML platforms typically apply such augmentations automatically, with limited or no user control. All experiments ran for 100 epochs with a batch size of 20, using an NVIDIA Quadro RTX 5000 graphics processing unit (GPU) with 16 GB video random-access memory for both training and evaluation.</p></sec><sec id="s2-4"><title>Cloud-Based AutoML Platform Implementation</title><sec id="s2-4-1"><title>Microsoft Azure Custom Vision</title><p>Custom Vision offers a streamlined workflow for training object detection models. Users must create Custom Vision training and prediction resources in their Azure subscription via the Azure portal. New object detection projects can be created through the Custom Vision portal or via SDK, which offers a choice of predefined domains optimized for specific scenarios and edge or mobile deployment. Training images can be uploaded directly via the web portal or SDK, with an integrated tagging interface for drawing bounding boxes and assigning labels. Custom Vision requires a minimum of 15 images per label for training, but recommends 50 or more images per label for better performance. Users can select either Quick training (completed in minutes) or Advanced training (with extended computation budget) [<xref ref-type="bibr" rid="ref27">27</xref>]. The platform automatically selects the optimal base model, training, tuning, and augmentation settings based on uploaded data and the selected project domain. Each training run generates a new model iteration that can be published to a prediction endpoint accessible via Representational State Transfer application programming interfaces (APIs) or SDKs.</p></sec><sec id="s2-4-2"><title>Google Vertex AI AutoML</title><p>Vertex AI requires an active Google Cloud project with images stored in Google Cloud Storage. 
The platform accepts various image formats, with data imported either from local machines or directly from Google Cloud. Annotations must be provided in comma-separated values or JSON Lines (JSONL) formats containing bounding box coordinates and corresponding labels. Vertex AI requires at least 10 annotated images per label for training and recommends providing 1000 or more for optimal model performance [<xref ref-type="bibr" rid="ref28">28</xref>]. For unannotated datasets, users can apply annotations through the Google Cloud Console interface. Without user-specified splitting, Vertex AI automatically partitions datasets using an 80%&#x2010;10%&#x2010;10% split for training, validation, and testing [<xref ref-type="bibr" rid="ref29">29</xref>]. Since our original XML annotations were incompatible with Vertex AI&#x2019;s input requirements, we converted them to a single JSONL file, with each line representing an annotated instance. We created the dataset by selecting object detection as the project objective and linking the JSONL file from a cloud storage bucket. Vertex AI&#x2019;s custom dataset splitting functionality was used to create appropriate training, validation, and testing sets. For optimization, we used the high-accuracy training option within Vertex AI. After performing initial experiments, we opted for a specific high-accuracy training configuration that significantly outperformed other available options.</p></sec><sec id="s2-4-3"><title>Amazon Rekognition Custom Labels</title><p>Rekognition requires prelabeled data in Amazon SageMaker Ground Truth manifest format, a JSONL structure with pixel-based bounding box coordinates [<xref ref-type="bibr" rid="ref30">30</xref>]. This format has a different internal structure compared to Vertex AI&#x2019;s requirements, necessitating the conversion of our annotations using Python scripts. 
Data management in Rekognition offers moderate flexibility; however, Amazon S3 is the mandatory cloud solution for image storage. If no test dataset is provided, Rekognition automatically splits the training data to create a test dataset using an 80%&#x2010;20% split [<xref ref-type="bibr" rid="ref31">31</xref>].</p><p>Model training can be initiated through the AWS Console or the CreateProjectVersion API. However, Rekognition provides no hyperparameter control, limiting customization during training. Each successful training run generates a new project version with an associated Amazon Resource Name. Performance metrics, including <italic>F</italic><sub>1</sub>-score, precision, recall, and mAP, are available through the API and the Rekognition console.</p></sec></sec><sec id="s2-5"><title>Evaluation Metrics</title><p>Object detection evaluation differs from standard classification because it requires assessment of localization and classification, meaning models must simultaneously locate objects within an image and correctly identify each object. Therefore, metrics that ignore spatial accuracy of predictions, such as simple classification accuracy, precision, and recall without intersection over union (IoU) thresholds, or classification-only <italic>F</italic><sub>1</sub>-scores, are insufficient for rigorous model evaluation.</p><p>To comprehensively assess our pill recognition models for both localization precision and classification accuracy, we used metrics incorporating IoU to enable fair comparison across fundamentally different model architectures and development approaches.</p><p>IoU quantifies the spatial accuracy of object detection by measuring the overlap between predicted and ground-truth bounding boxes, calculated as the ratio of overlap area to union area between boxes. 
Higher values indicate better localization accuracy, with a threshold (typically 0.5) determining whether a detection is considered correct.</p><p>In object detection, the accuracy metric is adapted to account for both classification and localization performance. A prediction is considered correct only if it has the correct class label and the IoU with the ground-truth bounding box exceeds a specified threshold.</p><p>Precision and recall incorporate IoU thresholds for object detection. Precision measures the percentage of correct detections among all predictions, indicating how reliable the model&#x2019;s positive predictions are. Recall measures the percentage of ground-truth objects correctly detected, reflecting the model&#x2019;s ability to identify all relevant instances.</p><p>Average precision represents the area under the precision-recall curve, providing a comprehensive performance measure across different confidence thresholds for each class. mAP averages average precision values across all classes, providing an overall performance measure. Common variants include mAP@0.50 (using an IoU threshold of 0.5) and mAP@0.50&#x2010;0.95 (averaging across multiple IoU thresholds from 0.5 to 0.95 in 0.05 increments), which provides a more rigorous evaluation.</p><p>Beyond standard performance metrics, a critical evaluation of model suitability for clinical deployment necessitates an analysis of error profiles. To understand the failures impacting patient safety, we analyzed OERs and FNRs for each model. OER and accuracy are inversely related; they measure the model&#x2019;s overall tendency to misidentify medications through missed detections or incorrect classifications. FNR specifically quantifies the proportion of pills that models fail to detect, a critical safety metric in clinical settings. 
A detailed description of evaluation metrics, mathematical definitions, and computational procedures is provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p></sec><sec id="s2-6"><title>Analytical Approach</title><p>This study used a comparative benchmarking design to characterize platform performance across diverse deployment scenarios rather than hypothesis testing. We selected standard object detection metrics following established conventions in computer vision and pill recognition research to enable direct comparison with other published literature. We additionally reported OER and FNR due to their clinical relevance for patient safety. CIs and formal statistical comparisons across hospital sites were not performed, as the evaluated cloud-based AutoML platforms have major inherent limitations and do not consistently provide the data needed for these calculations.</p></sec><sec id="s2-7"><title>Used Medicines</title><p>Our datasets included the 30 most commonly dispensed medications from 3 participating clinical centers [<xref ref-type="bibr" rid="ref15">15</xref>]. The medications represent various therapeutic areas, including cardiovascular agents (eg, bisoprolol and perindopril), antibiotics (amoxicillin/clavulanic acid), analgesics (naproxen and tramadol), psychiatric medications (alprazolam and quetiapine), and essential vitamins and supplements.</p><p>These medications cover a diverse range of solid oral dosage forms, including film-coated and uncoated tablets (the most prevalent forms), enteric-coated tablets, hard and soft gelatin capsules, and one chewable tablet. 
A complete medication list with active pharmaceutical ingredients, dosage forms, distinctive features, colors, and shapes is provided in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p></sec><sec id="s2-8"><title>Ethical Considerations</title><p>This study did not involve human participants, and ethics approval was therefore not required. No personally identifiable information was collected or processed. No informed consent was required, as no human participants were involved in this study.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overall Detection Performance</title><p>Our evaluation of pill recognition models across multiple test datasets revealed notable performance differences, with no single platform consistently dominating all test scenarios. We trained models using 5 progressively larger training subsets (from 1230 to 26,880 images) and evaluated them using standard metrics: accuracy, precision, recall, OER, FNR, <italic>F</italic><sub>1</sub>-score, mAP@0.50, and mAP@0.50&#x2010;0.95 to comprehensively assess platform performance, strengths, and limitations. 
<xref ref-type="table" rid="table1">Table 1</xref> presents the results of models trained on the full dataset of 26,880 images, with comprehensive results of all training runs and subsequent evaluations provided in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Summary of platform performance across all testing datasets (trained with the full training dataset of 26,880 images).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Platform and dataset</td><td align="left" valign="bottom">Accuracy (%)</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom">OER<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom">FNR<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">mAP<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup>@0.50</td><td align="left" valign="bottom">mAP@0.50&#x2010;0.95</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="9">Ultralytics YOLO11</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Kaposv&#x00E1;r</td><td align="left" valign="top">80.63</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.92</td><td align="left" valign="top">0.19</td><td align="left" valign="top">0.08</td><td align="left" valign="top">0.95</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.62</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Koml&#x00F3;</td><td align="left" valign="top">78.96</td><td align="left" valign="top">1.00</td><td align="left" 
valign="top">0.91</td><td align="left" valign="top">0.21</td><td align="left" valign="top">0.09</td><td align="left" valign="top">0.95</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.72</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>P&#x00E9;cs</td><td align="left" valign="top">64.58</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.91</td><td align="left" valign="top">0.35</td><td align="left" valign="top">0.09</td><td align="left" valign="top">0.95</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.76</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Verification</td><td align="left" valign="top">80.83</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.19</td><td align="left" valign="top">0.04</td><td align="left" valign="top">0.98</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.89</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Laboratory</td><td align="left" valign="top">40.00</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.87</td><td align="left" valign="top">0.60</td><td align="left" valign="top">0.13</td><td align="left" valign="top">0.93</td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.48</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Exhaustive</td><td align="left" valign="top">44.10</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.81</td><td align="left" valign="top">0.56</td><td align="left" valign="top">0.19</td><td align="left" valign="top">0.89</td><td align="left" valign="top">0.95</td><td 
align="left" valign="top">0.40</td></tr><tr><td align="left" valign="top" colspan="9">Google Vertex AI<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Kaposv&#x00E1;r</td><td align="left" valign="top">71.04</td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.90</td><td align="left" valign="top">0.29</td><td align="left" valign="top">0.10</td><td align="left" valign="top">0.93</td><td align="left" valign="top">0.97</td><td align="left" valign="top">0.53</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Koml&#x00F3;</td><td align="left" valign="top">90.00</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.10</td><td align="left" valign="top">0.04</td><td align="left" valign="top">0.98</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.58</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>P&#x00E9;cs</td><td align="left" valign="top">65.42</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.81</td><td align="left" valign="top">0.35</td><td align="left" valign="top">0.19</td><td align="left" valign="top">0.90</td><td align="left" valign="top">0.97</td><td align="left" valign="top">0.67</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Verification</td><td align="left" valign="top">91.60</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.08</td><td align="left" valign="top">0.02</td><td align="left" valign="top">0.99</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.79</td></tr><tr><td 
align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Laboratory</td><td align="left" valign="top">67.50</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.86</td><td align="left" valign="top">0.33</td><td align="left" valign="top">0.14</td><td align="left" valign="top">0.92</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.54</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Exhaustive</td><td align="left" valign="top">52.79</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.66</td><td align="left" valign="top">0.47</td><td align="left" valign="top">0.34</td><td align="left" valign="top">0.79</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.45</td></tr><tr><td align="left" valign="top" colspan="9">Microsoft Azure Custom Vision</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Kaposv&#x00E1;r</td><td align="left" valign="top">62.71</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.37</td><td align="left" valign="top">0.20</td><td align="left" valign="top">0.88</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.58</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Koml&#x00F3;</td><td align="left" valign="top">77.71</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.92</td><td align="left" valign="top">0.22</td><td align="left" valign="top">0.08</td><td align="left" valign="top">0.95</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.67</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>P&#x00E9;cs</td><td align="left" valign="top">56.04</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.81</td><td align="left" valign="top">0.44</td><td align="left" valign="top">0.19</td><td align="left" valign="top">0.89</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.65</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Verification</td><td align="left" valign="top">85.62</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.94</td><td align="left" valign="top">0.14</td><td align="left" valign="top">0.06</td><td align="left" valign="top">0.97</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.70</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Laboratory</td><td align="left" valign="top">51.67</td><td align="left" valign="top">0.97</td><td align="left" valign="top">0.78</td><td align="left" valign="top">0.48</td><td align="left" valign="top">0.22</td><td align="left" valign="top">0.86</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.59</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Exhaustive</td><td align="left" valign="top">33.61</td><td align="left" valign="top">0.92</td><td align="left" valign="top">0.54</td><td align="left" valign="top">0.66</td><td align="left" valign="top">0.46</td><td align="left" valign="top">0.68</td><td align="left" valign="top">0.77</td><td align="left" valign="top">0.33</td></tr><tr><td align="left" valign="top" colspan="9">Amazon Rekognition</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Kaposv&#x00E1;r</td><td align="left" 
valign="top">70.63</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.76</td><td align="left" valign="top">0.29</td><td align="left" valign="top">0.24</td><td align="left" valign="top">0.86</td><td align="left" valign="top">0.76</td><td align="left" valign="top">0.45</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Koml&#x00F3;</td><td align="left" valign="top">77.71</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.22</td><td align="left" valign="top">0.20</td><td align="left" valign="top">0.89</td><td align="left" valign="top">0.79</td><td align="left" valign="top">0.53</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>P&#x00E9;cs</td><td align="left" valign="top">20.62</td><td align="left" valign="top">0.92</td><td align="left" valign="top">0.26</td><td align="left" valign="top">0.79</td><td align="left" valign="top">0.74</td><td align="left" valign="top">0.40</td><td align="left" valign="top">0.26</td><td align="left" valign="top">0.19</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Verification</td><td align="left" valign="top">84.72</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.15</td><td align="left" valign="top">0.15</td><td align="left" valign="top">0.92</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.71</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Laboratory</td><td align="left" valign="top">43.33</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.57</td><td align="left" valign="top">0.20</td><td 
align="left" valign="top">0.89</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.77</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Exhaustive</td><td align="left" valign="top">41.97</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.76</td><td align="left" valign="top">0.58</td><td align="left" valign="top">0.24</td><td align="left" valign="top">0.86</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.67</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>OER: overall error rate.</p></fn><fn id="table1fn2"><p><sup>b</sup>FNR: false negative rate. </p></fn><fn id="table1fn3"><p><sup>c</sup>mAP: mean average precision. </p></fn><fn id="table1fn4"><p><sup>d</sup>AI: artificial intelligence.</p></fn></table-wrap-foot></table-wrap><p>Results are organized into 3 categories: UCTS (representing real-world hospital conditions), verification dataset (optimal imaging conditions replicating training images), and laboratory-controlled datasets (standardized research settings).</p><p>On the UCTS images from Kaposv&#x00E1;r hospital, models trained with the full training dataset of 26,880 images exhibited varied performance (<xref ref-type="table" rid="table1">Table 1</xref>). YOLO11 demonstrated superiority, achieving the highest scores in accuracy (80.63%), recall (0.92), <italic>F</italic><sub>1</sub>-score (0.95), and mAP metrics. Amazon Rekognition achieved the highest precision (1) but also showed the highest FNR, indicating its conservative detection strategy. Vertex AI ranked second with 71.04% accuracy. 
Microsoft Azure did not achieve top results in any instance on this dataset, but it maintained consistent performance, with accuracy ranging from 56.87% to 62.92% across training set sizes (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p><p>On the UCTS images from Koml&#x00F3; hospital, competition was balanced between YOLO11, Custom Vision, and Amazon Rekognition, while Vertex AI showed considerably higher performance compared to the others. Across all training configurations, YOLO11 dominated mAP metrics, achieving a perfect 1.0 mAP@0.50 with 14,400 and 26,880 training images. Vertex AI excelled in accuracy, reaching 90.62% with 7380 images and 90% with the full training subset of 26,880 images. Custom Vision improved steadily from 69.58% to 77.71% accuracy with increasing training data, while Amazon Rekognition maintained perfect precision (1.0) across all 5 training scenarios while achieving 77.71% accuracy.</p><p>On the UCTS images from P&#x00E9;cs hospital, Vertex AI outperformed other platforms despite varying results, achieving 79.79% accuracy with the training subset of 14,400 images but 65.42% with the full dataset. The other AutoML solutions performed poorly: Microsoft Azure achieved 57.5% accuracy with the training subset of 14,400 images and 56.04% with the full training dataset, while Amazon Rekognition reached only as high as 27.08% accuracy with the training subset of 7380 images on this dataset, which fell to 20.62% when trained with the full training dataset (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). YOLO11 also showed strong results, particularly for the more challenging mAP@0.50&#x2010;0.95 threshold metric (0.76). 
This dataset demonstrated the challenging nature of certain clinical environments, where lighting conditions, camera positioning, and other external factors significantly impact model performance.</p><p>On the verification dataset from the University of P&#x00E9;cs, Vertex AI excelled across all metrics, achieving 91.60% accuracy, perfect 1.0 precision and mAP@0.50, and 0.98 recall with the full training dataset. It only slightly underperformed YOLO11 in the mAP@0.50&#x2010;0.95 metric (0.79 vs 0.89). Microsoft Azure achieved 85.62% accuracy, 1.0 precision, and 1.0 mAP@0.50, while Amazon Rekognition reached 84.72% accuracy with perfect precision (1.0) but lower recall (0.85). YOLO11 showed linear performance improvement with increasing training dataset size, reaching 80.83% accuracy with perfect precision (1.0) and a high recall of 0.96. This dataset demonstrated the potential upper bound on performance when testing conditions are well-matched to training data.</p><p>The laboratory-controlled (single pills) and the exhaustive (multipills) test sets from the University of Pannonia showed varying performance across different platforms under optimal imaging conditions in a fully controlled environment. When analyzing the exhaustive dataset, Vertex AI achieved the highest accuracy on the full training dataset (52.79%) with excellent mAP@0.50 (0.99), followed by YOLO11 (44.10%, 0.95), Amazon Rekognition (41.97%, 0.98), and Microsoft Azure (33.61%, 0.77). The lower accuracy scores in the exhaustive dataset reflect the increased complexity of detecting multiple objects within an image, with potential occlusions and varying orientations. On the laboratory dataset, Vertex AI achieved 67.50% accuracy with perfect 1.0 precision and mAP@0.50, followed by Amazon Rekognition, mainly due to its robust mAP metrics, which maintained good performance across other metrics, although still falling short of Vertex AI&#x2019;s performance. 
<xref ref-type="fig" rid="figure4">Figure 4</xref> visualizes these trends across all datasets and training set sizes, illustrating nonlinear performance patterns and platform-specific variations.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Accuracy of object detection models across datasets. AI: artificial intelligence; AWS: Amazon Web Services; UCTS: uncontrolled clinical test set.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e79160_fig04.png"/></fig></sec><sec id="s3-2"><title>Influence of Training Dataset Size on Model Performance</title><p>Increasing training images generally improved model performance across all metrics, although improvements were nonlinear with platform-specific patterns. <xref ref-type="table" rid="table2">Table 2</xref> illustrates these effects using the verification dataset, which, despite being captured under optimal conditions similar to training data, serves as a neutral baseline for comparing platform responses to increased training data because all models were trained and tested using the same datasets. Overall, YOLO11 and Custom Vision showed predictable linear improvement, with YOLO11&#x2019;s accuracy increasing from 63.06% to 80.83% and Microsoft Azure&#x2019;s from 77.08% to 85.62% as training data increased from 1230 to 26,880 images. Conversely, Amazon Rekognition and Vertex AI showed variable patterns with several fluctuations, suggesting dynamic model selection based on dataset characteristics, resulting in performance variations that do not follow the expected logarithmic improvement pattern with an eventual plateau typically seen in ML models. 
Notably, Vertex AI achieved near-optimal performance (94.37% accuracy, 1.0 precision, and 1.0 recall) with the smallest training subset at just 1230 images, though challenging clinical datasets required the full training set for acceptable performance.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Effect of training dataset size on model performance on the verification dataset.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Platform and training subset</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom">OER<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="bottom">FNR<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">mAP<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup>@0.50</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="8">Ultralytics YOLO11</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1230 images</td><td align="left" valign="top">63.06%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.37</td><td align="left" valign="top">0.15</td><td align="left" valign="top">0.92</td><td align="left" valign="top">1.00</td></tr><tr><td align="char" char="." 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3450 images</td><td align="left" valign="top">67.36%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.86</td><td align="left" valign="top">0.33</td><td align="left" valign="top">0.14</td><td align="left" valign="top">0.93</td><td align="left" valign="top">1.00</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>7380 images</td><td align="left" valign="top">68.89%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.91</td><td align="left" valign="top">0.31</td><td align="left" valign="top">0.09</td><td align="left" valign="top">0.95</td><td align="left" valign="top">1.00</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>14,400 images</td><td align="left" valign="top">77.85%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.22</td><td align="left" valign="top">0.04</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.95</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>26,880 images</td><td align="left" valign="top">80.83%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.19</td><td align="left" valign="top">0.04</td><td align="left" valign="top">0.98</td><td align="left" valign="top">1.00</td></tr><tr><td align="left" valign="top" colspan="8">Google Vertex AI<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td></tr><tr><td align="char" char="." 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1230 images</td><td align="left" valign="top">94.37%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.06</td><td align="left" valign="top">0.00</td><td align="left" valign="top">1.00</td><td align="left" valign="top">1.00</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3450 images</td><td align="left" valign="top">83.06%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.17</td><td align="left" valign="top">0.02</td><td align="left" valign="top">0.99</td><td align="left" valign="top">1.00</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>7380 images</td><td align="left" valign="top">94.03%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.06</td><td align="left" valign="top">0.01</td><td align="left" valign="top">0.99</td><td align="left" valign="top">1.00</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>14,400 images</td><td align="left" valign="top">85.97%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.14</td><td align="left" valign="top">0.02</td><td align="left" valign="top">0.99</td><td align="left" valign="top">1.00</td></tr><tr><td align="char" char="." 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>26,880 images</td><td align="left" valign="top">91.60%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.08</td><td align="left" valign="top">0.02</td><td align="left" valign="top">0.99</td><td align="left" valign="top">1.00</td></tr><tr><td align="left" valign="top" colspan="8">Microsoft Azure Custom Vision</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1230 images</td><td align="left" valign="top">77.08%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.91</td><td align="left" valign="top">0.23</td><td align="left" valign="top">0.09</td><td align="left" valign="top">0.95</td><td align="left" valign="top">1.00</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3450 images</td><td align="left" valign="top">79.93%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.92</td><td align="left" valign="top">0.20</td><td align="left" valign="top">0.08</td><td align="left" valign="top">0.96</td><td align="left" valign="top">1.00</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>7380 images</td><td align="left" valign="top">80.76%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.90</td><td align="left" valign="top">0.19</td><td align="left" valign="top">0.10</td><td align="left" valign="top">0.95</td><td align="left" valign="top">1.00</td></tr><tr><td align="char" char="." 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>14,400 images</td><td align="left" valign="top">82.22%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.93</td><td align="left" valign="top">0.18</td><td align="left" valign="top">0.07</td><td align="left" valign="top">0.96</td><td align="left" valign="top">1.00</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>26,880 images</td><td align="left" valign="top">85.62%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.94</td><td align="left" valign="top">0.14</td><td align="left" valign="top">0.06</td><td align="left" valign="top">0.97</td><td align="left" valign="top">1.00</td></tr><tr><td align="left" valign="top" colspan="8">Amazon Rekognition</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1230 images</td><td align="left" valign="top">66.74%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.68</td><td align="left" valign="top">0.33</td><td align="left" valign="top">0.32</td><td align="left" valign="top">0.81</td><td align="left" valign="top">0.72</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3450 images</td><td align="left" valign="top">72.15%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.28</td><td align="left" valign="top">0.25</td><td align="left" valign="top">0.86</td><td align="left" valign="top">0.79</td></tr><tr><td align="char" char="." 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>7380 images</td><td align="left" valign="top">81.87%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.84</td><td align="left" valign="top">0.18</td><td align="left" valign="top">0.16</td><td align="left" valign="top">0.91</td><td align="left" valign="top">0.84</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>14,400 images</td><td align="left" valign="top">79.24%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.81</td><td align="left" valign="top">0.21</td><td align="left" valign="top">0.19</td><td align="left" valign="top">0.89</td><td align="left" valign="top">0.82</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>26,880 images</td><td align="left" valign="top">84.72%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.15</td><td align="left" valign="top">0.15</td><td align="left" valign="top">0.92</td><td align="left" valign="top">0.85</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>OER: overall error rate.</p></fn><fn id="table2fn2"><p><sup>b</sup>FNR: false negative rate.</p></fn><fn id="table2fn3"><p><sup>c</sup>mAP: mean average precision.</p></fn><fn id="table2fn4"><p><sup>d</sup>AI: artificial intelligence.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Cost, Usability, and Limitations of AutoML Platforms</title><p>Several important pricing and usability differences exist between platforms. Vertex AI and Amazon Rekognition Custom Labels charge continuously for endpoint availability, accumulating costs in minute increments even during idle periods. 
In contrast, Custom Vision charges only per request, eliminating costs during idle periods. Vertex AI maintains consistent flat-rate pricing regardless of the dataset size, charging US $3.465 per hour for training and US $2.002 per hour for deployment and online prediction [<xref ref-type="bibr" rid="ref32">32</xref>]. Amazon Rekognition Custom Labels charges US $1 per hour for training and US $4 per hour for inference, but scales training time and resources with dataset size, resulting in proportionally higher costs for larger datasets [<xref ref-type="bibr" rid="ref33">33</xref>]. Custom Vision charges US $10 per hour for training, but allows users to specify computation time budgets, providing better cost control. Inference costs US $2 per 1000 individual requests [<xref ref-type="bibr" rid="ref34">34</xref>]. While Google and Amazon have regional pricing variations, Azure maintains uniform US dollar pricing that is directly converted to local currencies. Each platform offers limited free tiers: Azure provides 1 hour of free training per month plus 10,000 predictions; Google provides US $300 in free credit for new customers and includes 30 minutes of free online predictions; Amazon offers 1 free inference hour per month during its 12-month free tier. YOLO11, being open-source, has no platform costs but requires local computational resources.</p><p>Training times varied considerably. YOLO11 training times scaled linearly with dataset size, from 38 minutes for the 1230-image training subset to nearly 13 hours for the full dataset. Microsoft Azure completed all training within 3 hours, even when allocated 4&#x2010;5 hours, indicating that the cloud-based AutoML platform determined that additional training was unnecessary. Amazon Rekognition required the longest time, reaching nearly 40 hours for the full dataset. Vertex AI maintained consistent 2.5&#x2010;3-hour training times regardless of dataset size. 
<xref ref-type="table" rid="table3">Table 3</xref> provides an overview of training times and associated costs for each platform.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Training time and cost comparison across platforms.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Platform and training subset</td><td align="left" valign="bottom">Training time (hour:minute)</td><td align="left" valign="bottom">Training cost (US $)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">Ultralytics YOLO11</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1230 images</td><td align="left" valign="top">00:38</td><td align="left" valign="top">&#x003C;1</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3450 images</td><td align="left" valign="top">01:41</td><td align="left" valign="top">&#x003C;1</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>7380 images</td><td align="left" valign="top">03:37</td><td align="left" valign="top">&#x003C;1</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>14,400 images</td><td align="left" valign="top">06:53</td><td align="left" valign="top">&#x003C;1</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>26,880 images</td><td align="left" valign="top">12:53</td><td align="left" valign="top">&#x003C;1</td></tr><tr><td align="left" valign="top" colspan="3">Google Vertex AI<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1230 
images</td><td align="left" valign="top">02:38</td><td align="left" valign="top">69.30</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3450 images</td><td align="left" valign="top">02:48</td><td align="left" valign="top">69.30</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>7380 images</td><td align="left" valign="top">02:53</td><td align="left" valign="top">69.30</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>14,400 images</td><td align="left" valign="top">02:58</td><td align="left" valign="top">69.30</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>26,880 images</td><td align="left" valign="top">02:59</td><td align="left" valign="top">69.30</td></tr><tr><td align="left" valign="top" colspan="3">Microsoft Azure Custom Vision</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1230 images</td><td align="left" valign="top">00:57</td><td align="left" valign="top">9.50</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3450 images</td><td align="left" valign="top">01:03</td><td align="left" valign="top">10.50</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>7380 images</td><td align="left" valign="top">01:02</td><td align="left" valign="top">10.33</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>14,400 images</td><td align="left" valign="top">01:37</td><td align="left" valign="top">16.10</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>26,880 images</td><td align="left" valign="top">02:52</td><td align="left" valign="top">28.60</td></tr><tr><td align="left" valign="top" colspan="3">Amazon Rekognition</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1230 images</td><td align="left" valign="top">04:55</td><td align="left" valign="top">5.43</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3450 images</td><td align="left" valign="top">09:28</td><td align="left" valign="top">10.48</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>7380 images</td><td align="left" valign="top">17:04</td><td align="left" valign="top">18.88</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>14,400 images</td><td align="left" valign="top">25:18</td><td align="left" valign="top">27.98</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>26,880 images</td><td align="left" valign="top">39:41</td><td align="left" valign="top">43.89</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>AI: artificial intelligence.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>Performance on Edge Device</title><p>We evaluated Microsoft Azure and YOLO11 on a Raspberry Pi 5 using the exhaustive test set. Only these platforms were tested since Microsoft Azure alone provided downloadable Open Neural Network Exchange (ONNX) models for edge deployment. 
<xref ref-type="table" rid="table4">Table 4</xref> shows a significant performance trade-off between edge and cloud-based inference.</p><p>Microsoft Azure&#x2019;s performance decreased on-device (25.83% accuracy) compared to cloud inference (33.61% accuracy). YOLO11 maintained better consistency, achieving 43.33% accuracy with ONNX and 44.10% with the native .pt format. Inference times differed substantially: Microsoft Azure ONNX achieved 0.19 seconds per prediction, while YOLO11 required 1.38 seconds per prediction in ONNX format on the Raspberry Pi 5.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Performance metrics comparison for edge deployment.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Platform and weight format<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">mAP<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup> @0.50</td><td align="left" valign="bottom">mAP@0.50&#x2010;0.95</td><td align="left" valign="bottom">Average prediction time (seconds)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="8">Ultralytics YOLO11</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ONNX<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top">43.33%</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.92</td><td align="left" valign="top">0.93</td><td align="left" valign="top">0.44</td><td align="left" valign="top">1.38</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>PyTorch (Meta AI; .pt)</td><td align="left" valign="top">44.10%</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.81</td><td align="left" valign="top">0.89</td><td align="left" valign="top">0.95</td><td align="left" valign="top">0.34</td><td align="left" valign="top">0.08</td></tr><tr><td align="left" valign="top" colspan="8">Microsoft Azure</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ONNX</td><td align="left" valign="top">25.83%</td><td align="left" valign="top">0.84</td><td align="left" valign="top">0.57</td><td align="left" valign="top">0.68</td><td align="left" valign="top">0.74</td><td align="left" valign="top">0.29</td><td align="left" valign="top">0.19</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>API<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup> call</td><td align="left" valign="top">33.61%</td><td align="left" valign="top">0.92</td><td align="left" valign="top">0.54</td><td align="left" valign="top">0.68</td><td align="left" valign="top">0.77</td><td align="left" valign="top">0.33</td><td align="left" valign="top">0.83</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Models trained with the full dataset of 26,880 training images; metrics calculated at a confidence level of 0.5 and an intersection over union (IoU) threshold of 0.5.</p></fn><fn id="table4fn2"><p><sup>b</sup>mAP: mean average precision.</p></fn><fn id="table4fn3"><p><sup>c</sup>ONNX: Open Neural Network Exchange. 
</p></fn><fn id="table4fn4"><p><sup>d</sup>API: application programming interface.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Our comprehensive evaluation of pill recognition models using code-based YOLO11 vs major code-free cloud-based AutoML platforms revealed that no single platform consistently outperforms others across all test scenarios, highlighting the importance of matching platform selection to specific deployment conditions and available training resources. The dramatic performance variations observed across clinical sites&#x2014;for example, ranging from 20.62% accuracy (Amazon Rekognition) to 65.42% accuracy (Vertex AI) on the same clinical hospital dataset when both were trained on the full 26,880-image training dataset&#x2014;demonstrate that imaging hardware, environmental, and personnel factors have a more tangible impact on model performance than platform choice alone.</p><p>Our results indicate that increasing the number of training images generally improves model performance across all metrics, although improvement patterns were not strictly linear and varied by platform, likely due to different underlying architectures and optimization strategies. Fair comparisons are challenging, especially for AutoML frameworks, which dynamically select the most suitable model architecture for each specific dataset and task. Consequently, different model types may be chosen (eg, a convolutional neural network in one instance and a vision transformer in another), even when trained on the same dataset.</p><p>Another critical finding is the substantial variation in how platforms balance the precision-recall trade-off across real-world clinical datasets. Amazon Rekognition&#x2019;s high precision across multiple clinical datasets often came at the cost of low recall, with values as low as 0.26 in one clinical dataset. 
This conservative approach resulted in the platform missing up to 74% of pills in the most challenging clinical dataset. Conversely, YOLO11&#x2019;s balanced precision-recall trade-off suggests suitability for scenarios where both false positives and false negatives carry clinical risk.</p><p>YOLO11 demonstrated the most consistent and predictable improvement trajectory with larger training datasets, achieving excellent mAP scores on most datasets, making it ideal for applications requiring precise object localization. The most dramatic improvement was observed on the most challenging clinical dataset, where accuracy increased from 22.71% to 64.58% and recall improved from 0.49 to 0.91. Consistently positive mAP trends across all datasets confirmed YOLO11&#x2019;s ability to effectively use additional training data.</p><p>Vertex AI presented more complex patterns. While generally improving with larger training datasets, performance fluctuations, including degradations, occurred in multiple instances with larger subsets. For instance, on the same clinical dataset, its accuracy dropped from 69.79% (1230-image training subset) to 57.08% (3450-image training subset), then recovered to 72.29% (7380-image training subset), dropped again to 62.08% (14,400-image training subset), and finally reached 71.04% (26,880-image full training dataset). This pattern likely reflects dynamic model architecture selection, where different model types are chosen for different dataset sizes. While resource-efficient, this behavior introduces unpredictability that could complicate training and deployment.</p><p>Custom Vision showed the most consistent behavior among cloud-based AutoML platforms, maintaining stable performance without degradation risks from model changes. 
Although Microsoft does not publicly disclose the model architecture used by Custom Vision, analyses of exported ONNX models suggest that the structure appears to be an older YOLOv2-based architecture, which would explain this consistency and the absence of architectural uncertainty we observed in other cloud-based AutoML platforms. Performance improvement trends paralleled YOLO11; however, the older architecture resulted in lower overall metrics.</p><p>Amazon Rekognition improved consistently with training data volume despite performing dynamic model selection during training. It demonstrated particularly strong performance on fully controlled datasets, especially in mAP values, with remarkable stability across IoU thresholds. Unlike other platforms, it showed minimal degradation at the 0.50 IoU threshold and maintained strong performance on the challenging mAP@0.50&#x2010;0.95 metric. It follows a conservative object detection strategy that prioritizes avoiding false positives, maintaining perfect precision (1.0) across multiple tests but with considerable false negative values indicative of missed detections, illustrated in the confusion matrix (<xref ref-type="fig" rid="figure5">Figure 5</xref>) with values listed in the background column. Analysis of confusion patterns revealed that the primary source of misclassification across all platforms was visual similarity among medications. Small, round, oblong, and oval white or off-white pills were most frequently confused with each other. Notably, these visually similar medications often represent therapeutically distinct drug classes (eg, beta-blockers, antiplatelet agents, and anxiolytics), where misidentification could have significant clinical consequences. Medications with distinctive visual characteristics, such as unique colors, shapes, or prominent surface markings, showed substantially lower misclassification rates across all platforms. 
The confusion matrices in <xref ref-type="fig" rid="figure5">Figure 5</xref> and <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref> illustrate these visual and platform-specific confusion patterns in detail.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Confusion matrix showing Amazon Rekognition performance on the verification dataset (trained on the full training dataset of 26,880 images).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e79160_fig05.png"/></fig><p>The cost-performance evaluation showed that although cloud-based AutoML platforms offer high accessibility, they also incur high platform costs, whereas traditional approaches, such as YOLO11, may provide comparable or better performance without platform costs. However, this does not account for the technical expertise required for YOLO11 implementation, which could offset the savings from platform costs. The primary cost consideration for YOLO11 implementation is human expertise rather than computational resources. Deployment requires proficiency in Linux system administration, Compute Unified Device Architecture and CUDA Deep Neural Network library GPU driver configuration, Python programming, and deep learning frameworks such as PyTorch, typically representing senior AI and ML engineering expertise that substantially increases total implementation cost compared to the accessible web interfaces of cloud-based AutoML platforms. Our experiments used a workstation equipped with an NVIDIA Quadro RTX 5000 GPU, acquired for approximately US $3650 in 2021. 
The system consumes approximately 0.38 kW during training; consequently, the longest training run (approximately 13 hours) consumed an estimated 5 kWh of electricity, representing a negligible operational cost of less than US $1 at typical regional commercial electricity rates.</p><p>Among the cloud-based AutoML platforms, Vertex AI showed robustness with numerous comparative advantages, including high accuracy and <italic>F</italic><sub>1</sub> performance values across various datasets, with relatively fast training time and good results on different training subsets, albeit with unpredictable fluctuations and significant cost barriers (higher flat-rate pricing and continued billing even when the endpoint is idle and the object detection model is not in use), which may limit its adoption in resource-constrained health care settings.</p></sec><sec id="s4-2"><title>Limitations</title><p>The broader context of AI deployment in health care encompasses critical considerations beyond the scope of this study, including data security, privacy, and interoperability with existing hospital systems and protocols. Several limitations should be considered when interpreting our results. The dynamic model selection by cloud-based AutoML platforms prevented direct architectural comparisons, as different underlying models were selected for different dataset sizes. Edge device testing was limited to Microsoft Azure and YOLO11, as other cloud-based AutoML platforms do not support ONNX model export. Performance metrics were evaluated at a conventional confidence threshold of 0.5, which may not be optimal for all platforms or deployment scenarios. The absence of CIs and formal statistical testing across hospital sites may be viewed as a limitation. 
However, the cloud-based AutoML platforms evaluated do not support the cross-validation procedures necessary for consistent variance estimation, and our study design prioritized assessment of the effects of real-world human and environmental factors on model performance. Several potential sources of bias should be considered when interpreting our results. The study was limited to 30 medications representing commonly dispensed products in Hungarian clinical settings, which does not encompass all possible visual characteristics encountered across different health care contexts or geographic regions. Performance may vary on larger medication databases, different medication types, and on medications with visual characteristics underrepresented in our dataset. While our clinical dataset was designed to assess real-world deployment challenges, it may confound interpretation of site-specific performance differences, as observed performance variations reflect the combined effects of platform capabilities and local imaging conditions rather than either factor in isolation. Collection of patient-based prescription images would face substantial ethical, regulatory, and practical barriers; therefore, in the creation of our clinical dataset, we used purposive sampling with balanced class representation to enable fair cross-platform comparison. While this approach is a standard methodology for ML benchmark studies and aligns with established datasets, it does not reflect natural prescription prevalence patterns. Future implementation studies should use representative sampling within appropriate ethical frameworks to validate model performance under authentic clinical workflow conditions.</p></sec><sec id="s4-3"><title>Practical Implications</title><p>The substantial variation in performance observed across different clinical settings highlights the importance of environmental factors in model performance. 
The marked performance degradation observed in certain clinical settings indicates that local validation is essential before deployment.</p><p>When selecting platforms, users should consider each platform&#x2019;s unique characteristics, especially when working with smaller training datasets. YOLO11 offers competitive performance without platform costs and provides superior edge device compatibility for cost-sensitive applications requiring local processing. However, larger training datasets may be necessary for optimal performance. When evaluating the total cost of ownership, YOLO11&#x2019;s zero platform cost must be weighed against both hardware investment and the human capital required for implementation and ongoing maintenance after deployment. Consequently, cloud-based AutoML platforms, despite higher per-use costs, may prove more economical for some organizations and deployment scenarios. Vertex AI offers strong performance and ease of use despite higher costs, making it suitable for scenarios in which access to technical expertise is limited. Its fixed pricing model can be advantageous for situations requiring frequent processing of large volumes of images. Custom Vision and Amazon Rekognition are more economical for smaller-scale deployments, given their pricing structures. Custom Vision delivers predictable performance improvements, while Rekognition provides high-precision detection. Platform selection ultimately requires balancing technical requirements, budget constraints, and deployment scale to achieve optimal clinical pill recognition performance. For real-time point-of-care verification, response times below 2&#x2010;3 seconds generally integrate well into verification workflows without causing perceptible delays that could reduce productivity, introduce operator fatigue, or disrupt routine workflow rhythm. 
All prediction times observed in our study fall within this acceptable range, suggesting that computational speed is unlikely to be a limiting factor for any of the evaluated platforms.</p><p>The clinical implications of different error types warrant careful consideration for deployment planning. Missed detections and misidentifications pose a significant risk if undetected, potentially leading to missed therapeutic effects, unexpected adverse events, or dangerous drug interactions. Our findings reinforce the necessity of manual verification during clinical workflows for maintaining safety. Automated dispensing and verification systems should augment rather than replace pharmacist verification, with particular vigilance required for medications exhibiting high visual similarity to other products in the hospital formulary.</p></sec><sec id="s4-4"><title>Conclusions</title><p>This study provides the first comprehensive comparison of a traditional code-based model and cloud-based AutoML approaches for pill recognition model training intended to be used in clinical settings. While no single platform dominated across all test scenarios, each demonstrated distinct advantages for specific use cases. YOLO11 proved reliable for model development and training due to its flexibility, cost-effectiveness, and edge deployment capabilities. Vertex AI delivered a strong overall performance with minimal technical expertise required. The substantial performance differences (accuracy ranging from 20.62% to 91.60%) observed across UCTS underscore the critical importance of imaging conditions and the value of rigorous validation for real-world applications.</p><p>Platform selection for pill recognition should consider (1) available technical expertise and resources, (2) deployment environment characteristics, (3) budget constraints, and (4) edge deployment requirements. 
Future research should explore performance on larger medication databases and develop standardized imaging protocols to achieve consistent results for training pill recognition systems. As both traditional deep learning frameworks and cloud-based AutoML platforms continue to evolve, periodic systematic evaluation of these technologies using existing pill image datasets could provide valuable insights into how these technologies can be used to improve medication safety through automated verification systems.</p></sec></sec></body><back><ack><p>The authors would like to sincerely thank the clinical pharmacists and staff at the Central Clinical Pharmacy of the University of P&#x00E9;cs, the Somogy County Kaposi M&#x00F3;r Teaching Hospital in Kaposv&#x00E1;r, and the Healthcare Center in Koml&#x00F3; for their assistance in collecting the clinical dataset images used in this study. We are grateful to NVIDIA Corporation for supporting our research through graphics processing units (GPUs) provided by the NVIDIA Hardware Grant Program.</p></ack><notes><sec><title>Funding</title><p>This study was funded by the University Research Scholarship Program of the Hungarian National Research, Development, and Innovation Fund (EK&#x00D6;P-24-4-I-PTE-57) and the Hungarian Scientific Research Fund grant (OTKA K-135729). The funder had no role in the study design, data collection and analysis, the decision to publish, or the preparation of the manuscript.</p></sec><sec><title>Data Availability</title><p>The data analyzed in this study are available from the corresponding author upon reasonable request. Requests should be sent to Amir Reza Ashraf at ashraf.amir.reza@pte.hu, specifying the type of data needed and its intended use. 
The corresponding author may provide the data subject to applicable conditions.</p></sec></notes><fn-group><fn fn-type="con"><p>ARA contributed to conceptualization, methodology, data curation, investigation, funding acquisition, project administration, writing the original draft, and reviewing and editing the manuscript. RR contributed to conceptualization, methodology, data curation, investigation, formal analysis, validation, writing the original draft, reviewing and editing the manuscript, and visualization. ZV contributed to methodology, visualization, funding acquisition, and reviewing and editing the manuscript. AF contributed to reviewing and editing the manuscript and supervision.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">AutoML</term><def><p>automated machine learning</p></def></def-item><def-item><term id="abb4">AWS</term><def><p>Amazon Web Services</p></def></def-item><def-item><term id="abb5">FNR</term><def><p>false negative rate</p></def></def-item><def-item><term id="abb6">GPU</term><def><p>graphics processing unit</p></def></def-item><def-item><term id="abb7">IoU</term><def><p>intersection over union</p></def></def-item><def-item><term id="abb8">mAP</term><def><p>mean average precision</p></def></def-item><def-item><term id="abb9">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb10">OER</term><def><p>overall error rate</p></def></def-item><def-item><term id="abb11">ONNX</term><def><p>Open Neural Network Exchange</p></def></def-item><def-item><term id="abb12">SDK</term><def><p>software development kit</p></def></def-item><def-item><term id="abb13">UCTS</term><def><p>uncontrolled clinical test set</p></def></def-item><def-item><term 
id="abb14">YOLO</term><def><p>You Only Look Once</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>Global burden of preventable medication-related harm in health care: a systematic review</article-title><source>World Health Organization</source><year>2023</year><access-date>2025-06-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.who.int/publications/i/item/9789240088887">https://www.who.int/publications/i/item/9789240088887</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Elliott</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Camacho</surname><given-names>E</given-names> </name><name name-style="western"><surname>Jankovic</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sculpher</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Faria</surname><given-names>R</given-names> </name></person-group><article-title>Economic analysis of the prevalence and clinical and economic burden of medication error in England</article-title><source>BMJ Qual Saf</source><year>2021</year><month>02</month><volume>30</volume><issue>2</issue><fpage>96</fpage><lpage>105</lpage><pub-id pub-id-type="doi">10.1136/bmjqs-2019-010206</pub-id><pub-id pub-id-type="medline">32527980</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Slight</surname><given-names>SP</given-names> </name><name name-style="western"><surname>Seger</surname><given-names>DL</given-names> </name><name name-style="western"><surname>Franz</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Wong</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bates</surname><given-names>DW</given-names> </name></person-group><article-title>The national cost of adverse drug events resulting from inappropriate medication-related alert overrides in the United States</article-title><source>J Am Med Inform Assoc</source><year>2018</year><month>09</month><day>1</day><volume>25</volume><issue>9</issue><fpage>1183</fpage><lpage>1188</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocy066</pub-id><pub-id pub-id-type="medline">29939271</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hodkinson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Tyler</surname><given-names>N</given-names> </name><name name-style="western"><surname>Ashcroft</surname><given-names>DM</given-names> </name><etal/></person-group><article-title>Preventable medication harm across health care settings: a systematic review and meta-analysis</article-title><source>BMC Med</source><year>2020</year><month>11</month><day>6</day><volume>18</volume><issue>1</issue><fpage>313</fpage><pub-id pub-id-type="doi">10.1186/s12916-020-01774-9</pub-id><pub-id pub-id-type="medline">33153451</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gillani</surname><given-names>SW</given-names> </name><name name-style="western"><surname>Gulam</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Role and services of a pharmacist in the prevention of medication errors: a systematic review</article-title><source>Curr Drug 
Saf</source><year>2021</year><volume>16</volume><issue>3</issue><fpage>322</fpage><lpage>328</lpage><pub-id pub-id-type="doi">10.2174/1574886315666201002124713</pub-id><pub-id pub-id-type="medline">33006539</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lester</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ding</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Rowell</surname><given-names>B</given-names> </name><name name-style="western"><surname>Kontar</surname><given-names>RA</given-names> </name></person-group><article-title>Performance evaluation of a prescription medication image classification model: an observational cohort</article-title><source>NPJ Digit Med</source><year>2021</year><month>07</month><day>27</day><volume>4</volume><issue>1</issue><fpage>118</fpage><pub-id pub-id-type="doi">10.1038/s41746-021-00483-8</pub-id><pub-id pub-id-type="medline">34315995</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lusk</surname><given-names>C</given-names> </name><name name-style="western"><surname>Catchpole</surname><given-names>K</given-names> </name><name name-style="western"><surname>Neyens</surname><given-names>DM</given-names> </name><etal/></person-group><article-title>Improving safety in the operating room: medication icon labels increase visibility and discrimination</article-title><source>Appl Ergon</source><year>2022</year><month>10</month><volume>104</volume><fpage>103831</fpage><pub-id pub-id-type="doi">10.1016/j.apergo.2022.103831</pub-id><pub-id pub-id-type="medline">35717790</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Batson</surname><given-names>S</given-names> </name><name name-style="western"><surname>Herranz</surname><given-names>A</given-names> </name><name name-style="western"><surname>Rohrbach</surname><given-names>N</given-names> </name><name name-style="western"><surname>Canobbio</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mitchell</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Bonnabry</surname><given-names>P</given-names> </name></person-group><article-title>Automation of in-hospital pharmacy dispensing: a systematic review</article-title><source>Eur J Hosp Pharm</source><year>2021</year><month>03</month><volume>28</volume><issue>2</issue><fpage>58</fpage><lpage>64</lpage><pub-id pub-id-type="doi">10.1136/ejhpharm-2019-002081</pub-id><pub-id pub-id-type="medline">32434785</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>H&#x00E4;nninen</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ahtiainen</surname><given-names>HK</given-names> </name><name name-style="western"><surname>Suvikas-Peltonen</surname><given-names>EM</given-names> </name><name name-style="western"><surname>T&#x00F6;tterman</surname><given-names>AM</given-names> </name></person-group><article-title>Automated unit dose dispensing systems producing individually packaged and labelled drugs for inpatients: a systematic review</article-title><source>Eur J Hosp Pharm</source><year>2023</year><month>05</month><volume>30</volume><issue>3</issue><fpage>127</fpage><lpage>135</lpage><pub-id pub-id-type="doi">10.1136/ejhpharm-2021-003002</pub-id><pub-id pub-id-type="medline">34795001</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Diwan</surname><given-names>T</given-names> </name><name name-style="western"><surname>Anirudh</surname><given-names>G</given-names> </name><name name-style="western"><surname>Tembhurne</surname><given-names>JV</given-names> </name></person-group><article-title>Object detection using YOLO: challenges, architectural successors, datasets and applications</article-title><source>Multimed Tools Appl</source><year>2023</year><volume>82</volume><issue>6</issue><fpage>9243</fpage><lpage>9275</lpage><pub-id pub-id-type="doi">10.1007/s11042-022-13644-y</pub-id><pub-id pub-id-type="medline">35968414</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ragab</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Abdulkadir</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Muneer</surname><given-names>A</given-names> </name><etal/></person-group><article-title>A comprehensive systematic review of YOLO for medical object detection (2018 to 2023)</article-title><source>IEEE Access</source><year>2024</year><volume>12</volume><fpage>57815</fpage><lpage>57836</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2024.3386826</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tan</surname><given-names>L</given-names> </name><name name-style="western"><surname>Huangfu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>W</given-names> </name></person-group><article-title>Comparison of RetinaNet, SSD, and YOLO v3 for real-time pill 
identification</article-title><source>BMC Med Inform Decis Mak</source><year>2021</year><month>11</month><day>22</day><volume>21</volume><issue>1</issue><fpage>324</fpage><pub-id pub-id-type="doi">10.1186/s12911-021-01691-8</pub-id><pub-id pub-id-type="medline">34809632</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>R&#x00E1;dli</surname><given-names>R</given-names> </name><name name-style="western"><surname>V&#x00F6;r&#x00F6;sh&#x00E1;zi</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Cz&#x00FA;ni</surname><given-names>L</given-names> </name></person-group><article-title>Metric&#x2010;based pill recognition with the help of textual and visual cues</article-title><source>IET Image Process</source><year>2024</year><month>12</month><volume>18</volume><issue>14</issue><fpage>4623</fpage><lpage>4638</lpage><pub-id pub-id-type="doi">10.1049/ipr2.13273</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Khanam</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hussain</surname><given-names>M</given-names> </name></person-group><article-title>YOLOv11: an overview of the key architectural enhancements</article-title><source>arXiv</source><access-date>2025-04-14</access-date><comment>Preprint posted online on Oct 23, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2410.17725">https://arxiv.org/abs/2410.17725</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2410.17725</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ashraf</surname><given-names>AR</given-names> </name><name
name-style="western"><surname>Somogyi-V&#x00E9;gh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Merczel</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gyimesi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Fittler</surname><given-names>A</given-names> </name></person-group><article-title>Leveraging code-free deep learning for pill recognition in clinical settings: a multicenter, real-world study of performance across multiple platforms</article-title><source>Artif Intell Med</source><year>2024</year><month>04</month><volume>150</volume><fpage>102844</fpage><pub-id pub-id-type="doi">10.1016/j.artmed.2024.102844</pub-id><pub-id pub-id-type="medline">38553153</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>W</given-names> </name><name name-style="western"><surname>Tong</surname><given-names>J</given-names> </name><name name-style="western"><surname>He</surname><given-names>R</given-names> </name><etal/></person-group><article-title>An easy method for identifying 315 categories of commonly-used Chinese herbal medicines based on automated image recognition using AutoML platforms</article-title><source>Inform Med Unlocked</source><year>2021</year><volume>25</volume><fpage>100607</fpage><pub-id pub-id-type="doi">10.1016/j.imu.2021.100607</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wan</surname><given-names>KW</given-names> </name><name name-style="western"><surname>Wong</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Ip</surname><given-names>HF</given-names> </name><etal/></person-group><article-title>Evaluation of the performance of traditional machine 
learning algorithms, convolutional neural network and AutoML vision in ultrasound breast lesions classification: a comparative study</article-title><source>Quant Imaging Med Surg</source><year>2021</year><month>04</month><volume>11</volume><issue>4</issue><fpage>1381</fpage><lpage>1393</lpage><pub-id pub-id-type="doi">10.21037/qims-20-922</pub-id><pub-id pub-id-type="medline">33816176</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sapkota</surname><given-names>R</given-names> </name><name name-style="western"><surname>Qureshi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Flores-Calero</surname><given-names>M</given-names> </name><etal/></person-group><article-title>YOLO11 to its genesis: a decadal and comprehensive review of the You Only Look Once (YOLO) series</article-title><source>arXiv</source><access-date>2025-04-16</access-date><comment>Preprint posted online on Jun 12, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2406.19407v5">https://arxiv.org/abs/2406.19407v5</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.19407</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ayyalasomayajula</surname><given-names>MMT</given-names> </name><name name-style="western"><surname>Chintala</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ayyalasomayajula</surname><given-names>S</given-names> </name></person-group><article-title>A cost-effective analysis of machine learning workloads in public clouds: is AutoML always worth using?</article-title><source>Int J Comput Sci Trends
Technol</source><year>2019</year><access-date>2025-04-16</access-date><volume>7</volume><issue>5</issue><fpage>107</fpage><lpage>115</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.ijcstjournal.org/volume-7/issue-5/IJCST-V7I5P14.pdf">https://www.ijcstjournal.org/volume-7/issue-5/IJCST-V7I5P14.pdf</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Song</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Gui</surname><given-names>X</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>F</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>T</given-names> </name></person-group><article-title>AutoML in the wild: obstacles, workarounds, and expectations</article-title><year>2023</year><month>04</month><day>19</day><conf-name>Proceedings of the 2023 CHI Conference on Human Factors in Computing Systems</conf-name><conf-date>Apr 23-28, 2023</conf-date><conf-loc>Hamburg, Germany</conf-loc><fpage>1</fpage><lpage>15</lpage><pub-id pub-id-type="doi">10.1145/3544548.3581082</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><article-title>What is Custom Vision?</article-title><source>Microsoft</source><access-date>2025-04-17</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://learn.microsoft.com/en-us/azure/ai-services/custom-vision-service/overview">https://learn.microsoft.com/en-us/azure/ai-services/custom-vision-service/overview</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><article-title>Vertex AI documentation</article-title><source>Google</source><access-date>2025-04-24</access-date><comment><ext-link ext-link-type="uri"
xlink:href="https://cloud.google.com/vertex-ai/docs">https://cloud.google.com/vertex-ai/docs</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="web"><article-title>Interfaces for Vertex AI</article-title><source>Google</source><access-date>2025-04-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://cloud.google.com/vertex-ai/docs/start/introduction-interfaces">https://cloud.google.com/vertex-ai/docs/start/introduction-interfaces</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="web"><article-title>What is Amazon Rekognition Custom Labels?</article-title><source>Amazon Web Services</source><access-date>2025-04-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://docs.aws.amazon.com/rekognition/latest/customlabels-dg/what-is.html">https://docs.aws.amazon.com/rekognition/latest/customlabels-dg/what-is.html</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ling</surname><given-names>S</given-names> </name><name name-style="western"><surname>Pastor</surname><given-names>A</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Few-shot pill recognition</article-title><conf-name>2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name><conf-date>Jun 13-19, 2020</conf-date><pub-id pub-id-type="doi">10.1109/CVPR42600.2020.00981</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yaniv</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Faruque</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Howe</surname><given-names>S</given-names> </name><etal/></person-group><article-title>The National Library of Medicine pill image recognition challenge: an initial report</article-title><source>IEEE Appl Imag Pattern Recognit Workshop</source><year>2016</year><month>10</month><volume>2016</volume><fpage>1</fpage><lpage>10</lpage><pub-id pub-id-type="doi">10.1109/AIPR.2016.8010584</pub-id><pub-id pub-id-type="medline">29854569</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><article-title>What&#x2019;s new in Custom Vision?</article-title><source>Microsoft</source><access-date>2025-04-17</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://learn.microsoft.com/en-us/azure/ai-services/custom-vision-service/whats-new">https://learn.microsoft.com/en-us/azure/ai-services/custom-vision-service/whats-new</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="web"><article-title>Prepare image training data for object detection</article-title><source>Google</source><access-date>2025-04-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://cloud.google.com/vertex-ai/docs/image-data/object-detection/prepare-data">https://cloud.google.com/vertex-ai/docs/image-data/object-detection/prepare-data</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><article-title>About data splits for AutoML models</article-title><source>Google</source><access-date>2025-04-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://cloud.google.com/vertex-ai/docs/general/ml-use">https://cloud.google.com/vertex-ai/docs/general/ml-use</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="web"><article-title>Managing labels</article-title><source>Amazon Web 
Services</source><access-date>2025-04-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://docs.aws.amazon.com/rekognition/latest/customlabels-dg/md-labels.html">https://docs.aws.amazon.com/rekognition/latest/customlabels-dg/md-labels.html</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="web"><article-title>Creating training and test datasets with images</article-title><source>Amazon Web Services</source><access-date>2025-04-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://docs.aws.amazon.com/rekognition/latest/customlabels-dg/md-create-dataset.html">https://docs.aws.amazon.com/rekognition/latest/customlabels-dg/md-create-dataset.html</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="web"><article-title>Pricing</article-title><source>Google</source><access-date>2025-06-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://cloud.google.com/vertex-ai/pricing">https://cloud.google.com/vertex-ai/pricing</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="web"><article-title>Amazon Rekognition pricing</article-title><source>Amazon Web Services</source><access-date>2025-06-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://aws.amazon.com/rekognition/pricing/">https://aws.amazon.com/rekognition/pricing/</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><article-title>Azure AI Custom Vision Service</article-title><source>Microsoft</source><access-date>2025-06-14</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://azure.microsoft.com/en-us/pricing/details/cognitive-services/custom-vision-service/">https://azure.microsoft.com/en-us/pricing/details/cognitive-services/custom-vision-service/</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>General structure of the YOLO11 model.</p><media xlink:href="medinform_v14i1e79160_app1.docx" xlink:title="DOCX File, 73 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Evaluation metrics.</p><media xlink:href="medinform_v14i1e79160_app2.docx" xlink:title="DOCX File, 343 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>List and properties of medications included in the study.</p><media xlink:href="medinform_v14i1e79160_app3.docx" xlink:title="DOCX File, 20 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Performance metrics of pill recognition models across different datasets.</p><media xlink:href="medinform_v14i1e79160_app4.docx" xlink:title="DOCX File, 56 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Additional confusion matrices.</p><media xlink:href="medinform_v14i1e79160_app5.docx" xlink:title="DOCX File, 4111 KB"/></supplementary-material></app-group></back></article>