<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e35987</article-id><article-id pub-id-type="doi">10.2196/35987</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Detection of Polyphonic Alarm Sounds From Medical Devices Using Frequency-Enhanced Deep Learning: Simulation Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Kishimoto</surname><given-names>Kazumasa</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Takemura</surname><given-names>Tadamasa</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sugiyama</surname><given-names>Osamu</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Kojima</surname><given-names>Ryosuke</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yakami</surname><given-names>Masahiro</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yamamoto</surname><given-names>Goshiro</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kuroda</surname><given-names>Tomohiro</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>Graduate School of Informatics, Kyoto University</institution><addr-line>54 Kawara-cho, Shogoin, Sakyo-ku</addr-line><addr-line>Kyoto</addr-line><country>Japan</country></aff><aff id="aff2"><institution>Graduate School of Medicine, Kyoto University</institution><addr-line>Kyoto</addr-line><country>Japan</country></aff><aff id="aff3"><institution>Kyoto University Hospital</institution><addr-line>Kyoto</addr-line><country>Japan</country></aff><aff id="aff4"><institution>Graduate School of Information Science, University of Hyogo</institution><addr-line>Kobe</addr-line><country>Japan</country></aff><aff id="aff5"><institution>Department of Information Science, Kindai University</institution><addr-line>Osaka</addr-line><country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Hori</surname><given-names>Kenta</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Reis</surname><given-names>Manuel J C S</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Kazumasa Kishimoto, PhD, Graduate School of Informatics, Kyoto University, 54 Kawara-cho, Shogoin, Sakyo-ku, Kyoto, 591-8022, Japan, 81 75-366-7701; <email>kishimoto@kuhp.kyoto-u.ac.jp</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>12</day><month>11</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e35987</elocation-id><history><date date-type="received"><day>30</day><month>03</month><year>2025</year></date><date date-type="rev-recd"><day>27</day><month>08</month><year>2025</year></date><date date-type="accepted"><day>05</day><month>10</month><year>2025</year></date></history><copyright-statement>&#x00A9; Kazumasa Kishimoto, Tadamasa Takemura, Osamu Sugiyama, Ryosuke Kojima, Masahiro Yakami, Goshiro Yamamoto, Tomohiro Kuroda. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 12.11.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e35987"/><abstract><sec><title>Background</title><p>Although an increasing number of bedside medical devices are equipped with wireless connections for reliable notifications, many nonnetworked devices remain effective at detecting abnormal patient conditions and alerting medical staff through auditory alarms. Staff members, however, can miss these notifications, especially when in distant areas or other private rooms. In contrast, the signal-to-noise ratio of alarm systems for medical devices in the neonatal intensive care unit is 0 dB or higher. A feasible system for automatic sound identification with high accuracy is needed to prevent alarm sounds from being missed by the staff.</p></sec><sec><title>Objective</title><p>The purpose of this study was to design a method for classifying multiple alarm sounds collected with a monaural microphone in a noisy environment.</p></sec><sec sec-type="methods"><title>Methods</title><p>Features of 7 alarm sounds were extracted using a mel filter bank and incorporated into a classifier using convolutional and recurrent neural networks. To estimate its clinical usefulness, the classifier was evaluated with mixtures of up to 7 alarm sounds and hospital ward noise.</p></sec><sec sec-type="results"><title>Results</title><p>The proposed convolutional recurrent neural network model was evaluated using a simulation dataset of 7 alarm sounds mixed with hospital ward noise. 
At a signal-to-noise ratio of 0 dB, the best-performing model (convolutional neural network 3+bidirectional gated recurrent unit) achieved an event-based <italic>F</italic><sub>1</sub>-score of 0.967, with a precision of 0.944 and a recall of 0.991. When the venous foot pump class was excluded, the classwise recall of the classifier ranged from 0.990 to 1.000.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The proposed classifier was found to be highly accurate in detecting alarm sounds. Although the performance of the proposed classifier in a clinical environment can be improved, the classifier could be incorporated into an alarm sound detection system. The classifier, combined with network connectivity, could improve the notification of abnormal status detected by unconnected medical devices.</p></sec></abstract><kwd-group><kwd>sound event detection</kwd><kwd>deep learning</kwd><kwd>alarm sound</kwd><kwd>polyphonic sound</kwd><kwd>notifications</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>While an increasing number of bedside medical devices, such as syringe pumps, have wireless connections that enable reliable data transmission to hospital information systems, many nonnetworked devices are still used in general hospital wards. Although dongles connected to the external output terminal of these devices may allow wireless connections [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>], most devices are not equipped with external output terminals. Instead, these devices use auditory alert signals (alarm sounds) to notify medical staff of abnormal conditions. Medical staff members may not hear these alarms, especially when they are in distant areas or other private rooms. 
Between 2010 and 2015, the Japan Council for Quality Health Care reported 173 accidents and other incidents, including 23 cases of unnoticed alarms [<xref ref-type="bibr" rid="ref3">3</xref>]. The report included the following comments about environmental factors:</p><disp-quote><p>There are many blind spots due to the facility&#x2019;s structure.</p></disp-quote><disp-quote><p>When I entered a patient&#x2019;s room, I could not hear alarm sounds from another room.</p></disp-quote><disp-quote><p>The alarm sound did not reach the staff because the room was far from the nurse station.</p></disp-quote><disp-quote><p>The staff could not hear the alarm in the farthest private room in the ICU.</p></disp-quote><disp-quote><p>The recording room was structured so that the alarm sound could not be heard.</p></disp-quote><p>These findings indicate the need for reliable alarm notification to ensure patient safety. Alarm sounds emitted by medical devices are regulated by the International Organization for Standardization and the International Electrotechnical Commission (ISO/IEC) 60601-1-8. This standard specifies the melodies and lengths of alarm sounds to reduce the risk of misunderstanding, confusion, and omission of alarm sounds from various medical devices, even when these sounds overlap and reverberate. This standard prescribes that the sounds should be organized based on the priority of corresponding abnormal situations, with alarm sounds for different situations varying in melody and length. Thus, sound event detection (SED) is expected to identify every kind of abnormal situation detected by medical devices [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. The standard also defines visual alarm signals, but monitoring the signals of multiple devices with cameras is not feasible without blind spots.</p><p>As SED can be implemented using a single (monaural) microphone, it was selected as the approach to detect alarm status. 
Clinical application of SED requires robustness against noise because environmental noise in hospital wards is generally substantial. SEDs with deep learning have been found to be sufficiently robust against noise [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>This study proposes a deep learning&#x2013;based method for classifying patient abnormalities detected by medical devices with polyphonic alarm sounds collected with a monaural microphone. The ability of the classifier to identify abnormal states of these devices was evaluated using simulation datasets of their alarm sounds superimposed on hospital ward noise (HWN). Therefore, the objective of this study was to design and evaluate a convolutional recurrent neural network (CRNN) for accurately detecting and classifying multiple, overlapping alarm sounds from medical devices in a simulated noisy hospital environment. We hypothesized that a hybrid CRNN model could achieve high performance suitable for clinical application by effectively capturing both the spectral and temporal features of the alarm sounds.</p></sec><sec id="s1-2"><title>Related Works</title><p>Accurate transmission of alarms from unconnected medical devices requires precise recognition of visual and auditory alarms. Several studies have reported high accuracy in detecting simultaneous alarm sounds mixed with a substantial level of environmental noise [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. For SED, deep learning is more robust against noise than conventional methods [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. On the basis of these studies, an edge device was placed at the patient&#x2019;s bedside to monitor abnormal conditions detected by multiple medical devices with individual alarm sounds. 
In our previous study, the classifier had <italic>F</italic><sub>1</sub>-scores of 0.727 at signal-to-noise ratios (SNRs) of 0 dB and applied a convolutional neural network (CNN) [<xref ref-type="bibr" rid="ref11">11</xref>]. To our knowledge, no previous study has used a deep learning&#x2013;based recurrent neural network (RNN) to detect polyphonic alarm sounds emitted by medical devices. Recent advances in attention-based architectures, such as the audio spectrogram transformer and conformer, have demonstrated strong performance in general SED tasks [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. These models use self-attention mechanisms to capture long-range dependencies, potentially enhancing robustness in noisy environments. Exploring such transformer-based approaches for clinical alarm sound detection remains an important direction for future research.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>The first approach to SED combines a Gaussian mixture model with a hidden Markov model, using features such as the mel-frequency cepstral coefficient from traditional methods of speech recognition [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>Other approaches include the separation of sound sources by matching them using a template extracted from the input sound. This can be achieved by sound source separation techniques, such as nonnegative matrix factorization. Nonnegative matrix factorization monitors a single signal to create a basis matrix and identifies the separated sounds [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>Recent approaches based on neural networks have significantly improved the performance of SED. 
One approach consists of SED of real-life sounds with feedforward neural networks based on a multilayer perceptron trained on a spectrum of mixed sounds [<xref ref-type="bibr" rid="ref6">6</xref>]. An RNN with the ability to remember past states can process sequential information of the acoustic signal. RNNs with bidirectional long short-term memory have achieved excellent results in complex audio detection such as speech recognition and polyphonic piano note transcription [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>Furthermore, CNNs commonly used in image recognition can robustly predict sounds with their filters shifted by both time and frequency axes [<xref ref-type="bibr" rid="ref22">22</xref>]. However, long-term prediction remains difficult due to the limited width of the time window [<xref ref-type="bibr" rid="ref9">9</xref>]. Therefore, although alarm sounds consist of relatively simple tones, it is necessary to predict not only the frequency axis but also the time axis to inform the priority with a pattern. Application of an RNN to polyphonic SED enabled long-term prediction by integrating the information over the time window. This study combines the strengths of both CNN and RNN to benefit from both approaches. A similar approach has shown excellent performance in automatic speech recognition [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>].</p><p>Therefore, this approach was expected to achieve sufficient performance for potential clinical application in alarm sound recognition.</p></sec><sec id="s2-2"><title>Deep Alarm Sound Detection</title><sec id="s2-2-1"><title>Experiment Overview</title><p>The proposed polyphonic alarm sound detection consisted of feature extraction, classification algorithms, and model training (<xref ref-type="fig" rid="figure1">Figure 1</xref>). 
A mel filter bank (MFB) was used to extract the features of alarm sounds, and CNN and RNN were applied to the classifier. As deep learning requires substantial training data, a large amount of acoustic data were recorded in a quiet room. The recorded data were mixed with pseudo noise before being used for training. This approach aimed to maximize the generalizability of the classifier for expected use in a noisy environment. The feature extraction step extracted data to be input to the classifier from the collected alarm sounds. The classification algorithms were designed to use deep learning models for classifiers. The model training augmented the data to create a robust model.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overview of the experiment. SNR: signal-to-noise ratio.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e35987_fig01.png"/></fig></sec><sec id="s2-2-2"><title>Feature Extraction</title><p>The features of polyphonic alarm sounds of bedside medical devices were extracted. Many acoustic classifiers use log mel spectrogram multiplied spectra and MFB based on the characteristics underlying human frequency perception [<xref ref-type="bibr" rid="ref25">25</xref>]. The acoustic data were transformed to a power spectrogram with the short-time Fourier transform of the Hamming window, which had a window size of 1024 and a hop length of 512. The inner product on the power spectrogram was calculated with MFB, and its logarithms in 40 dimensions were calculated.</p></sec><sec id="s2-2-3"><title>Classification Algorithms</title><p>The CRNN, a combination of CNN and RNN, was used for the classification model (<xref ref-type="fig" rid="figure2">Figure 2</xref>) [<xref ref-type="bibr" rid="ref26">26</xref>]. This system used 256&#x00D7;40 pixels of the log mel spectrogram as a feature. 
A CNN block in the proposed model consisted of a convolutional layer with 128 filters, batch normalization, the activation function of the rectified linear unit, and a dropout of 50% [<xref ref-type="bibr" rid="ref27">27</xref>]. Using the size reduction method, the max pooling layer was applied to the frequency axis. In image recognition, replacing the pooling layer with a CNN stride 2 has been reported to improve its performance by reducing the calculation cost [<xref ref-type="bibr" rid="ref28">28</xref>]. In this study, the size of the stride to the frequency axis was reduced to 2, and the pooling layer was excluded because the classification target was 1 frame of the sound.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Architecture of the proposed convolutional recurrent neural network. The architecture consists of three main components: (1) convolutional neural network (CNN) block (convolutional layers with batch normalization, rectified linear unit [ReLU] activation, max pooling, and dropout), (2) recurrent neural network (RNN) block (bidirectional gated recurrent unit [BiGRU] or bidirectional simple recurrent neural network [BiSRNN] layers), and (3) time-distributed dense with sigmoid activation for classification.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e35987_fig02.png"/></fig><p>The RNN block consisted of a bidirectional simple RNN (BiSRNN) or bidirectional gated recurrent unit (BiGRU). The input was set to 32 units, the activation function to tanh, the recurrent activation function to the hard sigmoid, and the dropout of each layer to 50%.</p><p>Finally, the activation function of the fully connected layer was set to sigmoid, the optimization function used was Adam [<xref ref-type="bibr" rid="ref29">29</xref>], and the loss function was set to binary cross-entropy. 
Each of the 7 sound event classes had output values in the range of [0, 1] [<xref ref-type="bibr" rid="ref5">5</xref>].</p><p><xref ref-type="table" rid="table1">Table 1</xref> shows the details of the proposed model. CNN3+BiSRNN was used as the baseline model. The environment was built with Python programming language (version 3.7.11; Python Software Foundation), using Keras 2.3.1 for the deep learning library (TensorFlow 2.0.0 for the back end) and Librosa 0.8.1 for the acoustic analysis module. The experimental code is available online[<xref ref-type="bibr" rid="ref30">30</xref>].</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Details of the proposed model.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model name</td><td align="left" valign="bottom" colspan="4">CNN<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> block</td><td align="left" valign="bottom" colspan="2">RNN<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> block</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Layer</td><td align="left" valign="bottom">Number of layers</td><td align="left" valign="bottom">Stride</td><td align="left" valign="bottom">Max pooling<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="bottom">Layer</td><td align="left" valign="bottom">Number of layers</td></tr></thead><tbody><tr><td align="left" valign="top">CNN3+BiSRNN<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup> (baseline)</td><td align="left" valign="top">Conv2D<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup></td><td align="left" valign="top">3</td><td align="left" valign="top">1&#x00D7;1</td><td align="left" valign="top">5, 2, 2</td><td align="left" valign="top">BiSRNN</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">CNN3+BiGRU<sup><xref ref-type="table-fn" 
rid="table1fn6">f</xref></sup></td><td align="left" valign="top">Conv2D</td><td align="left" valign="top">3</td><td align="left" valign="top">1&#x00D7;1</td><td align="left" valign="top">5, 2, 2</td><td align="left" valign="top">BiGRU</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">CNN4+BiGRU</td><td align="left" valign="top">Conv2D</td><td align="left" valign="top">4</td><td align="left" valign="top">1&#x00D7;1</td><td align="left" valign="top">2, 2, 2, 2</td><td align="left" valign="top">BiGRU</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">ALL-CNN4+BiGRU<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup></td><td align="left" valign="top">Conv2D</td><td align="left" valign="top">4</td><td align="left" valign="top">1&#x00D7;2</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn8">h</xref></sup></td><td align="left" valign="top">BiGRU</td><td align="left" valign="top">2</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>CNN: convolutional neural network.</p></fn><fn id="table1fn2"><p><sup>b</sup>The figures denote frequency axis. Time axis=1 (ie,1&#x00D7;5=5).</p></fn><fn id="table1fn3"><p><sup>c</sup>RNN: recurrent neural network.</p></fn><fn id="table1fn4"><p><sup>d</sup>BiSRNN: bidirectional simple recurrent neural network.</p></fn><fn id="table1fn5"><p><sup>e</sup>Conv2D: two-dimensional convolution.</p></fn><fn id="table1fn6"><p><sup>f</sup>BiGRU: bidirectional gated recurrent unit.</p></fn><fn id="table1fn7"><p><sup>g</sup>ALL-CNN: customizing stride without max pooling.</p></fn><fn id="table1fn8"><p><sup>h</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-2-4"><title>Model Training</title><p>Data augmentation was applied to the training dataset to prevent overfitting of the classifier and to provide robust performance in simulation experiments [<xref ref-type="bibr" rid="ref31">31</xref>]. 
The 7 alarm sounds were superimposed on white noise at SNRs ranging from 30 to 0 dB in 1 dB steps. In addition, SpecAugment was applied to the generator, with 1 random mask added for each frequency and time axis, followed by performance of 5 steps per minibatch [<xref ref-type="bibr" rid="ref32">32</xref>].</p><p>These steps produced a trained model using 5-fold cross-validation over 150 training epochs and confirmed that the learning curve showed no signs of overfitting.</p></sec></sec><sec id="s2-3"><title>Evaluation</title><sec id="s2-3-1"><title>Data Collection</title><p>The medical devices selected included those frequently used for ventilator-equipped patients who require many medical devices in the general ward of the hospital. Alarm sounds to be identified included pulse sounds from a syringe pump (SP), enteral feeding pump (ENP), and venous foot pump (VFP) device as well as burst sounds from an infusion pump (IP), chest drainage (CD), patient monitor (PM), and the mechanical ventilator. The alarm sounds of each device were recorded using a monaural microphone placed at the head of a bed in a quiet private room in the hospital. The distance between the sound source and the microphone was the same as in a typical bedside setting (<xref ref-type="fig" rid="figure3">Figure 3</xref>). The sound pressure level was recorded simultaneously. The recording has a different number of active sound events superimposed on each frame. Therefore, the frame has various polyphony levels. The distribution of polyphony levels when recording the alarm sounds for the 7 devices is shown in <xref ref-type="table" rid="table2">Table 2</xref>. Audacity was used for labeling, extracting fundamental frequencies, and performing spectral analysis for annotation. <xref ref-type="table" rid="table3">Table 3</xref> shows the detailed characteristics of the recorded alarm sounds.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Recording environment. 
Numbers in parentheses represent the distance between the microphone and each device.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e35987_fig03.png"/></fig><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Distribution of polyphony levels in the recorded alarm sounds.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Polyphony level</td><td align="left" valign="top">Data amount, n (%)</td></tr></thead><tbody><tr><td align="char" char="." valign="top">0</td><td align="char" char="." valign="top">0 (0)</td></tr><tr><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">3655 (5.7)</td></tr><tr><td align="char" char="." valign="top">2</td><td align="char" char="." valign="top">15,011 (23.6)</td></tr><tr><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">22,547 (35.4)</td></tr><tr><td align="char" char="." valign="top">4</td><td align="char" char="." valign="top">15,911 (25)</td></tr><tr><td align="char" char="." valign="top">5</td><td align="char" char="." valign="top">5280 (8.3)</td></tr><tr><td align="char" char="." valign="top">6</td><td align="char" char="." valign="top">1230 (1.9)</td></tr><tr><td align="char" char="." valign="top">7</td><td align="char" char="." 
valign="top">64 (0.1)</td></tr></tbody></table></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Detailed characteristics of the sounds of each alarm.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Source device</td><td align="left" valign="bottom">Model (manufacturer)</td><td align="left" valign="bottom">Peak frequencies (Hz)<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom">Signal duration (seconds)</td><td align="left" valign="bottom">Silence duration (seconds)</td><td align="left" valign="bottom">Overall duration (seconds)</td></tr></thead><tbody><tr><td align="left" valign="top">Infusion pump</td><td align="left" valign="top">OT-818G (JMS)</td><td align="left" valign="top">[(856,856,856)-(856,856)] 2 times</td><td align="left" valign="top">3.34</td><td align="left" valign="top">2.99</td><td align="left" valign="top">6.33</td></tr><tr><td align="left" valign="top">Syringe pump</td><td align="left" valign="top">TE-351 (Terumo)</td><td align="left" valign="top">4001</td><td align="left" valign="top">0.26</td><td align="left" valign="top">0.20</td><td align="left" valign="top">0.46</td></tr><tr><td align="left" valign="top">Enteral feeding pump</td><td align="left" valign="top">APPLIX Smart (Fresenius Kabi)</td><td align="left" valign="top">4097</td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.80</td><td align="left" valign="top">1.60</td></tr><tr><td align="left" valign="top">Venous foot pump</td><td align="left" valign="top">SCD700 (Covidien)</td><td align="left" valign="top">2108</td><td align="left" valign="top">0.20</td><td align="left" valign="top">1.30</td><td align="left" valign="top">1.50</td></tr><tr><td align="left" valign="top">Chest drainage</td><td align="left" valign="top">THOPAZ (Medela)</td><td align="left" valign="top">2632</td><td align="left" valign="top">1.07</td><td align="left" 
valign="top">8.95</td><td align="left" valign="top">10.02</td></tr><tr><td align="left" valign="top">Patient monitor</td><td align="left" valign="top">PVM-4761 (Nihon Kohden)</td><td align="left" valign="top">[(783,994,1181)&#x2010;(1181,1569)] 2 times<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">2.93</td><td align="left" valign="top">4.09</td><td align="left" valign="top">7.02</td></tr><tr><td align="left" valign="top">Mechanical ventilator</td><td align="left" valign="top">C1 (Hamilton)</td><td align="left" valign="top">491 828 662<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">1.10</td><td align="left" valign="top">4.98</td><td align="left" valign="top">6.08</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Fundamental frequencies analyzed by Audacity. Harmonics are excluded.</p></fn><fn id="table3fn2"><p><sup>b</sup>High-priority alarm sound.</p></fn><fn id="table3fn3"><p><sup>c</sup>Middle-priority alarm sound.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-3-2"><title>Simulation Dataset</title><p>The robustness of the classifier was evaluated using a simulation dataset of alarm sounds added to HWN (which comprised conversations, footsteps, closet opening and closing sounds, intraoral suction, and ventilator exhalation sounds but not alarm sounds from other medical devices) at different SNRs (<xref ref-type="fig" rid="figure1">Figure 1</xref>). For the simulation dataset, the alarm sounds were recorded separately from those in the Data Collection section, using the same recording protocol (quiet private room, identical microphone placement, and device settings). During recording, all 7 devices were set to repeatedly emit alarms, sometimes overlapping due to differences in their alarm durations. Multiple alarms sometimes sounded simultaneously because of differences in the duration times of each alarm sound. 
<xref ref-type="table" rid="table4">Table 4</xref> shows the maximum sound pressure of each sound source. In contrast to the training dataset&#x2014;where alarm sounds were superimposed on white noise at SNRs ranging from 30 to 0 dB in 1 dB steps&#x2014;the simulation dataset used for evaluation was created by superimposing the recorded HWN on alarm sounds at 4 SNR settings (30, 20, 10, and 0 dB) to reproduce realistic clinical environments. As shown in this figure, the VFP alarm exhibited the lowest sound pressure level among the devices, whereas other devices, such as SP and ENP, had relatively higher levels.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Maximum sound pressure levels of each of the devices in the recording simulation dataset.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Source device</td><td align="left" valign="top">Sound level (dB)</td></tr></thead><tbody><tr><td align="left" valign="top">Infusion pump</td><td align="left" valign="top">71.9</td></tr><tr><td align="left" valign="top">Syringe pump</td><td align="left" valign="top">77.7</td></tr><tr><td align="left" valign="top">Enteral feeding pump</td><td align="left" valign="top">64.4</td></tr><tr><td align="left" valign="top">Venous toot pump</td><td align="left" valign="top">61.2</td></tr><tr><td align="left" valign="top">Chest drainage</td><td align="left" valign="top">73.7</td></tr><tr><td align="left" valign="top">Patient monitor</td><td align="left" valign="top">62.1</td></tr><tr><td align="left" valign="top">Mechanical ventilator</td><td align="left" valign="top">79.6</td></tr><tr><td align="left" valign="top">Hospital ward noise</td><td align="left" valign="top">63.4</td></tr></tbody></table></table-wrap></sec><sec id="s2-3-3"><title>Performance Metrics</title><p>Model performance was evaluated using 5-fold cross-validation. 
No formal statistical significance tests were conducted, as the primary objective was descriptive benchmarking rather than hypothesis testing. Evaluation was performed using the sed_eval module [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. The classifier outputs each predicted value of the 7 devices. The predicted values were dichotomized based on a cutoff value of 0.5. The onset and offset times input into the sed_eval module were calculated from the change points of the output results. Segment-based metrics are an index that determines whether the reference and the output match for each set time resolution (second). Therefore, the time resolution was set to 0.1 seconds, half of the shortest alarm duration time among the 7 devices. The event-based metrics index evaluated the timing of onset and offset from the set collar (seconds). The collar was set to 2.0 seconds based on the response of the notification system. Event-based metrics evaluation considers the actual operation and is more stringent than segment-based evaluation. Comparisons were evaluated using overall metrics (microaverage) and class-wise metrics.</p><p>For clinical applicability, inaccurate information is unacceptable, as it puts patients at risk. Therefore, the requirement for clinical application was set at an <italic>F</italic><sub>1</sub>-score value of 0.900 or higher for event-based overall metrics and an <italic>F</italic> value of 0.950 or higher for class-wise recall metrics.</p><p>Finally, the predicted results were visualized using the sed_vis module, and the spectrogram and classification results were examined [<xref ref-type="bibr" rid="ref34">34</xref>].</p></sec></sec><sec id="s2-4"><title>Ethical Considerations</title><p>This study did not involve human participants or animal experiments. The recordings contained no speech, personal identifiers, or patient-related information. 
Therefore, ethics approval was not required in accordance with institutional and international research ethics policies.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p><xref ref-type="table" rid="table5">Table 5</xref> summarizes the overall performance metrics at an SNR of 0 dB. The 679-second simulation dataset consisted of 63,649 frames, of which 63,488 frames were evaluated using 256-frame input windows. Among the segment-based metrics, CNN4+BiGRU achieved the highest <italic>F</italic><sub>1</sub>-score, followed by CNN3+BiGRU. Conversely, for event-based metrics, CNN3+BiGRU outperformed all other models, with CNN4+BiGRU ranking second. Regarding recall, event-based metrics showed values of 0.950 or higher for all BiGRU-based models, whereas segment-based metrics yielded recall values below 0.950 for all models.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Overall metrics (microaverage) at a signal-to-noise ratio of 0 dB across 5-fold cross-validation. 
The numbers in italics represent the optimal value for the proposed models.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="3">Segment-based metrics, mean (SD)</td><td align="left" valign="bottom" colspan="3">Event-based metrics, mean (SD)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top">Precision</td><td align="left" valign="top">Recall</td><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top">Precision</td><td align="left" valign="top">Recall</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="7"/></tr><tr><td align="left" valign="top">CNN3<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup>+BiSRNN<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup> (baseline)</td><td align="left" valign="top">0.832 (0.007)</td><td align="left" valign="top">0.822 (0.009)</td><td align="left" valign="top">0.841 (0.011)</td><td align="left" valign="top">0.921 (0.003)</td><td align="left" valign="top">0.874 (0.009)</td><td align="left" valign="top">0.973 (0.008)</td></tr><tr><td align="left" valign="top">CNN3+BiGRU<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">0.868 (0.008)</td><td align="left" valign="top">0.845 (0.006)</td><td align="left" valign="top">0.894 (0.012)</td><td align="left" valign="top"><italic>0.967 (0.011)</italic></td><td align="left" valign="top"><italic>0.944 (0.014)</italic></td><td align="left" valign="top"><italic>0.991 (0.011)</italic></td></tr><tr><td align="left" valign="top">CNN4+BiGRU</td><td align="left" valign="top"><italic>0.873 (0.004)</italic></td><td align="left" valign="top"><italic>0.856 (0.003)</italic></td><td align="left" valign="top">0.890 (0.008)</td><td align="left" valign="top">0.965 (0.008)</td><td align="left" 
valign="top">0.942 (0.009)</td><td align="left" valign="top">0.989 (0.008)</td></tr><tr><td align="left" valign="top">ALL-CNN4+BiGRU</td><td align="left" valign="top">0.867 (0.010)</td><td align="left" valign="top">0.839 (0.010)</td><td align="left" valign="top"><italic>0.896 (0.011)</italic></td><td align="left" valign="top">0.948 (0.021)</td><td align="left" valign="top">0.915 (0.032)</td><td align="left" valign="top">0.983 (0.008)</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>CNN: convolutional neural network.</p></fn><fn id="table5fn2"><p><sup>b</sup>BiSRNN: bidirectional simple recurrent neural network.</p></fn><fn id="table5fn3"><p><sup>c</sup>BiGRU: bidirectional gated recurrent unit.</p></fn></table-wrap-foot></table-wrap><p><xref ref-type="table" rid="table6">Table 6</xref> shows the event-based class-wise metrics at an SNR of 0 dB. Only event-based metrics are reported here, as they reflect the temporal accuracy of onset and offset detection, which is essential for clinical alarm management, whereas class-wise segment-based metrics are less indicative of operational performance. Class-wise segment-based results are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref30">30</xref>] for reference. Event-based evaluation showed that CNN3+BiGRU and CNN4+BiGRU had a recall value of 0.990 or higher for all devices but VFP. In contrast, the ENP, PM, and ventilator had <italic>F</italic><sub>1</sub>-scores of 0.900 or less due to their low precision. The reference standard was the correctly annotated label from the event roll that visualized the classification results, while the output was the model&#x2019;s identified results. 
In the absence of sound, the device was detected visually (<xref ref-type="fig" rid="figure4">Figure 4</xref>).</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Event-based class-wise metrics at a signal-to-noise ratio of 0 dB across 5-fold cross-validation. The numbers in italics represent the optimal value for each class.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Metrics and class</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score, mean (SD)</td><td align="left" valign="bottom">Precision, mean (SD)</td><td align="left" valign="bottom">Recall, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">CNN3<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup>+BiSRNN<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>IP<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">0.945 (0.016)</td><td align="left" valign="top">0.900 (0.028)</td><td align="left" valign="top">0.995 (0.001)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SP<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">0.994 (0.004)</td><td align="left" valign="top">0.990 (0.008)</td><td align="left" valign="top"><italic>0.998 (0.000)</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ENP<sup><xref ref-type="table-fn" rid="table6fn5">e</xref></sup></td><td align="left" valign="top">0.903 (0.011)</td><td align="left" valign="top">0.831 (0.016)</td><td align="left" valign="top">0.990 (0.007)</td></tr><tr><td align="left" valign="top" 
colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>VFP<sup><xref ref-type="table-fn" rid="table6fn6">f</xref></sup></td><td align="left" valign="top">0.829 (0.013)</td><td align="left" valign="top">0.828 (0.036)</td><td align="left" valign="top">0.834 (0.050)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CD<sup><xref ref-type="table-fn" rid="table6fn7">g</xref></sup></td><td align="left" valign="top">0.815 (0.023)</td><td align="left" valign="top">0.688 (0.034)</td><td align="left" valign="top"><italic>1.000 (0.000)</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>PM<sup><xref ref-type="table-fn" rid="table6fn8">h</xref></sup></td><td align="left" valign="top">0.854 (0.020)</td><td align="left" valign="top">0.749 (0.031)</td><td align="left" valign="top">0.993 (0.003)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Vent<sup><xref ref-type="table-fn" rid="table6fn9">i</xref></sup></td><td align="left" valign="top">0.752 (0.058)</td><td align="left" valign="top">0.606 (0.078)</td><td align="left" valign="top"><italic>1.000 (0.000)</italic></td></tr><tr><td align="left" valign="top" colspan="5">CNN3+BiGRU<sup><xref ref-type="table-fn" rid="table6fn10">j</xref></sup></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>IP</td><td align="left" valign="top">0.982 (0.008)</td><td align="left" valign="top">0.967 (0.015)</td><td align="left" valign="top">0.997 (0.001)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SP</td><td align="left" valign="top"><italic>0.998 
(0.000)</italic></td><td align="left" valign="top"><italic>1.000 (0.000)</italic></td><td align="left" valign="top">0.997 (0.001)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ENP</td><td align="left" valign="top"><italic>0.969 (0.016)</italic></td><td align="left" valign="top"><italic>0.939 (0.029)</italic></td><td align="left" valign="top"><italic>1.000 (0.000)</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>VFP</td><td align="left" valign="top">0.939 (0.037)</td><td align="left" valign="top">0.931 (0.030)</td><td align="left" valign="top"><italic>0.951 (0.074)</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CD</td><td align="left" valign="top">0.904 (0.031)</td><td align="left" valign="top">0.826 (0.052)</td><td align="left" valign="top"><italic>1.000 (0.000)</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>PM</td><td align="left" valign="top">0.924 (0.025)</td><td align="left" valign="top">0.862 (0.043)</td><td align="left" valign="top"><italic>0.997 (0.001)</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Vent</td><td align="left" valign="top"><italic>0.834 (0.034)</italic></td><td align="left" valign="top"><italic>0.717 (0.050)</italic></td><td align="left" valign="top"><italic>1.000 (0.000)</italic></td></tr><tr><td align="left" valign="top" colspan="5">CNN4+BiGRU</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>IP</td><td align="left" valign="top"><italic>0.986 
(0.005)</italic></td><td align="left" valign="top"><italic>0.977 (0.009)</italic></td><td align="left" valign="top">0.995 (0.002)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SP</td><td align="left" valign="top">0.998 (0.000)</td><td align="left" valign="top"><italic>1.000 (0.000)</italic></td><td align="left" valign="top">0.997 (0.001)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ENP</td><td align="left" valign="top">0.939 (0.033)</td><td align="left" valign="top">0.887 (0.058)</td><td align="left" valign="top"><italic>1.000 (0.000)</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>VFP</td><td align="left" valign="top"><italic>0.943 (0.022)</italic></td><td align="left" valign="top"><italic>0.947 (0.027)</italic></td><td align="left" valign="top">0.942 (0.054)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CD</td><td align="left" valign="top">0.870 (0.030)</td><td align="left" valign="top">0.770 (0.048)</td><td align="left" valign="top"><italic>1.000 (0.000)</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>PM</td><td align="left" valign="top"><italic>0.954 (0.014)</italic></td><td align="left" valign="top"><italic>0.916 (0.025)</italic></td><td align="left" valign="top">0.996 (0.002)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Vent</td><td align="left" valign="top">0.797 (0.054)</td><td align="left" valign="top">0.666 (0.080)</td><td align="left" valign="top"><italic>1.000 
(0.000)</italic></td></tr><tr><td align="left" valign="top" colspan="5">ALL-CNN4+BiGRU</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>IP</td><td align="left" valign="top">0.942 (0.030)</td><td align="left" valign="top">0.896 (0.052)</td><td align="left" valign="top">0.995 (0.003)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SP</td><td align="left" valign="top"><italic>0.998 (0.000)</italic></td><td align="left" valign="top"><italic>1.000 (0.000)</italic></td><td align="left" valign="top">0.996 (0.001)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ENP</td><td align="left" valign="top">0.908 (0.056)</td><td align="left" valign="top">0.836 (0.090)</td><td align="left" valign="top"><italic>1.000 (0.000)</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>VFP</td><td align="left" valign="top">0.921 (0.047)</td><td align="left" valign="top">0.943 (0.046)</td><td align="left" valign="top">0.901 (0.057)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CD</td><td align="left" valign="top"><italic>0.949 (0.018)</italic></td><td align="left" valign="top"><italic>0.903 (0.033)</italic></td><td align="left" valign="top"><italic>1.000 (0.000)</italic></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>PM</td><td align="left" valign="top">0.904 (0.020)</td><td align="left" valign="top">0.829 (0.032)</td><td align="left" valign="top">0.995 (0.003)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Vent</td><td align="left" valign="top">0.805 (0.051)</td><td align="left" valign="top">0.677 (0.074)</td><td align="left" valign="top"><italic>1.000 (0.000)</italic></td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>CNN: convolutional neural network.</p></fn><fn id="table6fn2"><p><sup>b</sup>BiSRNN: bidirectional simple recurrent neural network.</p></fn><fn id="table6fn3"><p><sup>c</sup>IP: infusion pump.</p></fn><fn id="table6fn4"><p><sup>d</sup>SP: syringe pump.</p></fn><fn id="table6fn5"><p><sup>e</sup>ENP: enteral feeding pump.</p></fn><fn id="table6fn6"><p><sup>f</sup>VFP: venous foot pump.</p></fn><fn id="table6fn7"><p><sup>g</sup>CD: chest drainage.</p></fn><fn id="table6fn8"><p><sup>h</sup>PM: patient monitor.</p></fn><fn id="table6fn9"><p><sup>i</sup>Vent: mechanical ventilator.</p></fn><fn id="table6fn10"><p><sup>j</sup>BiGRU: bidirectional gated recurrent unit.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>The reference and proposed model outputs of a convolutional neural network 4+bidirectional gated recurrent unit at a signal-to-noise ratio of 0 dB: (A) spectrogram of the simulation dataset and (B) event roll of the reference and proposed model outputs for the 7 devices. CD: chest drainage; ENP: enteral feeding pump; IP: infusion pump; PM: patient monitor; SP: syringe pump; VFP: venous foot pump; VENT: mechanical ventilator.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e35987_fig04.png"/></fig></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>The proposed classifier was found to successfully detect the status of nearby medical devices. 
Although video-based alarm detection was also considered, it was not feasible to detect the monitors with cameras without the introduction of a blind spot. Therefore, SED was considered a more feasible approach than video recognition.</p><p>This study describes the construction of a classifier using a large amount of artificial noise data. This classifier was used to evaluate polyphonic alarm sounds among a simulation dataset of HWN. Because the classification target is a sine wave, we expected that the target could be achieved with simple neural networks. However, BiSRNN did not achieve the target performance, requiring the application of BiGRU. Both the frequency and time axes required advanced processing to recognize alarm sounds. In addition, a convolutional layer with 4 layers outperformed one with 3 layers. The <italic>F</italic><sub>1</sub>-score of the classifier using BiGRU was 0.900 or higher, which was robust in the detection of polyphonic alarm sounds. Only VFP in each proposed classifier was unclear in the spectrogram due to VFP having the lowest sound pressure level of the 7 devices, thus resulting in low recall. In addition, masking of an impact sound over the entire frequency axis of the spectrogram would result in lower precision and overdetection.</p></sec><sec id="s4-2"><title>Clinical Application as a Notification System</title><p>The notification system should alert the hospital information system without missing any situations, whether alarms are sounding on a single device or on multiple devices simultaneously. Therefore, recall of class-wise metrics was considered the most important in evaluating clinical applications. In particular, recall for the ventilator, the most essential support device of the 7 evaluated, was 1.000.</p><p>CNN4+BiGRU had the highest <italic>F</italic><sub>1</sub>-score in the event-based overall metrics, and the recall values of ENP, CD, and the ventilator were 1.000 each. 
Only VFP showed a recall value below 0.950, primarily due to its lower sound pressure level. In some cases, detection occurred slightly earlier than the actual onset, which reduced the measured recall. However, the ventilator demonstrated low precision with frequent overdetections, suggesting that it could not be evaluated clinically because it could cause alarm fatigue [<xref ref-type="bibr" rid="ref35">35</xref>]. In contrast, because the <italic>F</italic><sub>1</sub>-scores of ENP, VFP, and CD without external output were 0.900 or higher, the system was likely feasible to notify staff of the alarm status of medical devices that could not be connected to the hospital information system. Therefore, the proposed system demonstrated feasibility as an alarm sound detection system and can be further refined for clinical use.</p></sec><sec id="s4-3"><title>Integration Into Hospital Networks</title><sec id="s4-3-1"><title>Data Privacy Implications</title><p>The system processes only nonspeech alarm signals, ensuring that no patient-identifiable audio is stored or transmitted. When integrated into hospital networks, alarm classifications should be anonymized at the edge, transmitting only classification results in compliance with privacy regulations such as the General Data Protection Regulation and Japan&#x2019;s Act on the Protection of Personal Information.</p></sec><sec id="s4-3-2"><title>Real-Time Processing Feasibility</title><p>Our classifier was designed for lightweight deployment, with feature extraction and model inference feasible on edge devices. The processing latency is on the order of milliseconds per input window, enabling real-time alarm monitoring without delaying clinical response.</p></sec><sec id="s4-3-3"><title>Regulatory Concerns</title><p>Integration of an alarm detection system into hospital infrastructure would require compliance with medical device standards, including ISO/IEC 60601-1-8 for alarm systems. 
Depending on the jurisdiction, such a system may be classified as a medical device software, requiring regulatory approval. Early consultation with regulatory authorities is recommended to ensure compliance and patient safety.</p></sec></sec><sec id="s4-4"><title>Limitations</title><p>This study had several limitations. First, the study evaluated only a limited range of medical devices from a single hospital, potentially limiting generalizability. Second, differences in hospital architecture (eg, room layouts, wall materials, and ambient noise) may affect sound propagation and detection. Third, despite ISO/IEC 60601-1-8 regulations, variations in alarm sounds across manufacturers and models were not assessed. Finally, large-scale live recording and annotation remain impractical; future work should use synthetic datasets and validate performance across diverse environments and device types.</p></sec><sec id="s4-5"><title>Comparison With Prior Work</title><p>Several reports have examined the classification of alarm sounds using deep learning. Evaluation of single sounds of horns and bicycle bells found that the 5-layer deep neural network that applied an integrated judgment process had <italic>F</italic><sub>1</sub>-scores of 0.99 or higher [<xref ref-type="bibr" rid="ref4">4</xref>]. Because horns and similar devices do not create sine waves of digital sounds, they cannot be evaluated in the same way as alarm sounds of medical devices. A study of alarm sounds of medical devices in a neonatal intensive care unit found that most of the alarms had SNRs of 0 dB or higher [<xref ref-type="bibr" rid="ref8">8</xref>]. That study, however, did not consider a classifier for polyphonic alarm sounds. 
A classifier using CRNN was found to be effective for polyphonic acoustic sounds [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref36">36</xref>].</p></sec><sec id="s4-6"><title>Conclusions</title><p>Missed medical device alarms can lead to serious adverse events. To mitigate such risks, we developed a deep learning&#x2013;based classifier for detecting polyphonic alarm sounds in hospital environments.</p><p>Alarm sounds emitted by medical devices are regulated by ISO/IEC 60601-1-8. Because this standard defines different tones and patterns for each device and priority, SED is expected to identify the device and priority successfully. Thus, we considered SED appropriate to determine alarm status. Automatic identification of alarm sounds in hospital rooms would facilitate safer medical care.</p><p>In the simulation experiment, the polyphonic alarm sound classifier showed excellent performance, with an <italic>F</italic><sub>1</sub>-score of 0.945 at an SNR of 0 dB. The proposed classifier demonstrated feasibility for clinical alarm sound detection and can be further optimized. 
When combined with network connectivity, this classifier could improve the notification of abnormal patient status detected by medical devices without requiring each device to be individually connected.</p></sec></sec></body><back><ack><p>This work was partially supported by the Japan Society for the Promotion of Science KAKENHI program (JP24K21125).</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BiGRU</term><def><p>bidirectional gated recurrent unit</p></def></def-item><def-item><term id="abb2">BiSRNN</term><def><p>bidirectional simple recurrent neural network</p></def></def-item><def-item><term id="abb3">CD</term><def><p>chest drainage</p></def></def-item><def-item><term id="abb4">CNN</term><def><p>convolutional neural network</p></def></def-item><def-item><term id="abb5">CRNN</term><def><p>convolutional recurrent neural network</p></def></def-item><def-item><term id="abb6">ENP</term><def><p>enteral feeding pump</p></def></def-item><def-item><term id="abb7">HWN</term><def><p>hospital ward noise</p></def></def-item><def-item><term id="abb8">IEC</term><def><p>International Electrotechnical Commission</p></def></def-item><def-item><term id="abb9">IP</term><def><p>infusion pump</p></def></def-item><def-item><term id="abb10">ISO</term><def><p>International Organization for Standardization</p></def></def-item><def-item><term id="abb11">MFB</term><def><p>mel filter bank</p></def></def-item><def-item><term id="abb12">PM</term><def><p>patient monitor</p></def></def-item><def-item><term id="abb13">RNN</term><def><p>recurrent neural network</p></def></def-item><def-item><term id="abb14">SED</term><def><p>sound event detection</p></def></def-item><def-item><term id="abb15">SNR</term><def><p>signal-to-noise ratio</p></def></def-item><def-item><term id="abb16">SP</term><def><p>syringe pump</p></def></def-item><def-item><term id="abb17">VFP</term><def><p>venous foot 
pump</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yoshioka</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ishiyama</surname><given-names>S</given-names> </name><name name-style="western"><surname>Saitoh</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Development and testing of a ventilator remote adapter via communication between the ventilator and receiving smart device utilizing the accompanying app</article-title><source>Jpn J Med Instrum</source><year>2018</year><volume>88</volume><issue>4</issue><fpage>449</fpage><lpage>457</lpage><pub-id pub-id-type="doi">10.4286/jjmi.88.449</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Michiyoshi</surname><given-names>S</given-names> </name></person-group><article-title>Inclusion of infusion pump in automatic control system</article-title><source>Jpn J Intensive Care Med</source><year>2020</year><access-date>2025-11-10</access-date><volume>44</volume><issue>3</issue><fpage>117</fpage><lpage>122</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://jglobal.jst.go.jp/en/detail?JGLOBAL_ID=202002286048854088">https://jglobal.jst.go.jp/en/detail?JGLOBAL_ID=202002286048854088</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ishikawa</surname><given-names>M</given-names> </name><name name-style="western"><surname>Saito</surname><given-names>N</given-names> </name></person-group><article-title>Strategies to prevent recurrence of incidents and accidents related to medical device alarm 
systems</article-title><source>Iryo Kikigaku</source><year>2017</year><volume>87</volume><issue>3</issue><fpage>285</fpage><lpage>291</lpage><pub-id pub-id-type="doi">10.4286/jjmi.87.285</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Shiraishi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Takeda</surname><given-names>T</given-names> </name><name name-style="western"><surname>Shitara</surname><given-names>A</given-names> </name></person-group><article-title>Alarm sound classification system in smartphones for the deaf and hard-of-hearing using deep neural networks</article-title><access-date>2025-10-17</access-date><conf-name>International Conference on Advances in Computer-Human Interactions</conf-name><conf-date>Nov 21-25, 2020</conf-date><conf-loc>Valencia, Spain</conf-loc><fpage>30</fpage><lpage>33</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.thinkmind.org/index.php?view=article&#x0026;articleid=achi_2020_3_10_28007">https://www.thinkmind.org/index.php?view=article&#x0026;articleid=achi_2020_3_10_28007</ext-link></comment></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Cakir</surname><given-names>E</given-names> </name><name name-style="western"><surname>Heittola</surname><given-names>T</given-names> </name><name name-style="western"><surname>Huttunen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Virtanen</surname><given-names>T</given-names> </name></person-group><article-title>Multi-label vs. 
combined single-label sound event detection with deep neural networks</article-title><conf-name>European Signal Processing Conference</conf-name><conf-date>Aug 31 to Sep 4, 2015</conf-date><conf-loc>Nice</conf-loc><fpage>2551</fpage><lpage>2555</lpage><pub-id pub-id-type="doi">10.1109/EUSIPCO.2015.7362845</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Cakir</surname><given-names>E</given-names> </name><name name-style="western"><surname>Heittola</surname><given-names>T</given-names> </name><name name-style="western"><surname>Huttunen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Virtanen</surname><given-names>T</given-names> </name></person-group><article-title>Polyphonic sound event detection using multi label deep neural networks</article-title><conf-name>International Joint Conference on Neural Networks</conf-name><conf-date>Jul 12-17, 2015</conf-date><conf-loc>Killarney, Ireland</conf-loc><fpage>1</fpage><lpage>7</lpage><pub-id pub-id-type="doi">10.1109/IJCNN.2015.7280624</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mesaros</surname><given-names>A</given-names> </name><name name-style="western"><surname>Heittola</surname><given-names>T</given-names> </name><name name-style="western"><surname>Virtanen</surname><given-names>T</given-names> </name></person-group><article-title>Metrics for polyphonic sound event detection</article-title><source>Appl Sci (Basel)</source><year>2016</year><month>06</month><volume>6</volume><issue>6</issue><fpage>162</fpage><pub-id pub-id-type="doi">10.3390/app6060162</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Raboshchuk</surname><given-names>G</given-names> </name><name name-style="western"><surname>Nadeu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Jancovic</surname><given-names>P</given-names> </name><etal/></person-group><article-title>A knowledge-based approach to automatic detection of equipment alarm sounds in a neonatal intensive care unit environment</article-title><source>IEEE J Transl Eng Health Med</source><year>2018</year><volume>6</volume><issue>4400110</issue><pub-id pub-id-type="doi">10.1109/JTEHM.2017.2781224</pub-id><pub-id pub-id-type="medline">29404227</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McLoughlin</surname><given-names>I</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Song</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>W</given-names> </name></person-group><article-title>Robust sound event classification using deep neural networks</article-title><source>IEEE/ACM Trans Audio Speech Lang Process</source><year>2015</year><month>03</month><volume>23</volume><issue>3</issue><fpage>540</fpage><lpage>552</lpage><pub-id pub-id-type="doi">10.1109/TASLP.2015.2389618</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Phan</surname><given-names>H</given-names> </name><name name-style="western"><surname>Hertel</surname><given-names>L</given-names> </name><name name-style="western"><surname>Maass</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Mertins</surname><given-names>A</given-names> </name></person-group><article-title>Robust audio event recognition with 1-max pooling convolutional neural networks</article-title><conf-name>Annual Conference of the International Speech Communication Association</conf-name><conf-date>Sep 8-12, 2016</conf-date><pub-id pub-id-type="doi">10.21437/Interspeech.2016-123</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kishimoto</surname><given-names>K</given-names> </name><name name-style="western"><surname>Takemura</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sugiyama</surname><given-names>O</given-names> </name><etal/></person-group><article-title>Prediction of polyphonic alarm sound by deep neural networks</article-title><source>Jpn Soc Med Biol Eng</source><year>2022</year><month>03</month><volume>60</volume><issue>1</issue><fpage>8</fpage><lpage>15</lpage><pub-id pub-id-type="doi">10.11239/jsmbe.60.8</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Gong</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chung</surname><given-names>YA</given-names> </name><name name-style="western"><surname>Glass</surname><given-names>J</given-names> </name></person-group><article-title>AST: audio spectrogram transformer</article-title><conf-name>Annual Conference of the International Speech Communication Association</conf-name><conf-date>Aug 30 to Sep 3, 2021</conf-date><pub-id pub-id-type="doi">10.21437/Interspeech.2021-698</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Gulati</surname><given-names>A</given-names> </name><name name-style="western"><surname>Qin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chiu</surname><given-names>CC</given-names> </name><etal/></person-group><article-title>Conformer: convolution-augmented transformer for speech recognition</article-title><conf-name>Annual Conference of the International Speech Communication Association</conf-name><conf-date>Oct 25-29, 2020</conf-date><pub-id pub-id-type="doi">10.21437/Interspeech.2020-3015</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Mesaros</surname><given-names>A</given-names> </name><name name-style="western"><surname>Heittola</surname><given-names>T</given-names> </name><name name-style="western"><surname>Eronen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Virtanen</surname><given-names>T</given-names> </name></person-group><article-title>Acoustic event detection in real life recordings</article-title><access-date>2025-10-17</access-date><conf-name>European Signal Processing Conference</conf-name><conf-date>Aug 23-27, 2010</conf-date><conf-loc>Aalborg, Denmark</conf-loc><fpage>1267</fpage><lpage>1271</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://ieeexplore.ieee.org/document/7096611">https://ieeexplore.ieee.org/document/7096611</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Heittola</surname><given-names>T</given-names> </name><name name-style="western"><surname>Mesaros</surname><given-names>A</given-names> </name><name name-style="western"><surname>Virtanen</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Gabbouj</surname><given-names>M</given-names> </name></person-group><article-title>Supervised model training for overlapping sound events based on unsupervised source separation</article-title><conf-name>IEEE International Conference on Acoustics, Speech and Signal Processing</conf-name><conf-date>May 26-31, 2013</conf-date><conf-loc>Vancouver, BC</conf-loc><fpage>8677</fpage><lpage>8681</lpage><pub-id pub-id-type="doi">10.1109/ICASSP.2013.6639360</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Innami</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kasai</surname><given-names>H</given-names> </name></person-group><article-title>NMF-based environmental sound source separation using time-variant gain features</article-title><source>Comput Math Appl</source><year>2012</year><month>09</month><volume>64</volume><issue>5</issue><fpage>1333</fpage><lpage>1342</lpage><pub-id pub-id-type="doi">10.1016/j.camwa.2012.03.077</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Dessein</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cont</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lemaitre</surname><given-names>G</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Nielsen</surname><given-names>F</given-names> </name><name name-style="western"><surname>Bhatia</surname><given-names>R</given-names> </name></person-group><article-title>Real-time detection of overlapping sound events with non-negative matrix factorization</article-title><source>Matrix Information 
Geometry</source><year>2013</year><publisher-name>Springer</publisher-name><fpage>341</fpage><lpage>371</lpage><pub-id pub-id-type="doi">10.1007/978-3-642-30232-9_14</pub-id><pub-id pub-id-type="other">9783642302329</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Mesaros</surname><given-names>A</given-names> </name><name name-style="western"><surname>Heittola</surname><given-names>T</given-names> </name><name name-style="western"><surname>Dikmen</surname><given-names>O</given-names> </name><name name-style="western"><surname>Virtanen</surname><given-names>T</given-names> </name></person-group><article-title>Sound event detection in real life recordings using coupled matrix factorization of spectral representations and class activity annotations</article-title><conf-name>IEEE International Conference on Acoustics, Speech and Signal Processing</conf-name><conf-date>Apr 19-24, 2015</conf-date><conf-loc>Queensland, Australia</conf-loc><fpage>151</fpage><lpage>155</lpage><pub-id pub-id-type="doi">10.1109/ICASSP.2015.7177950</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Graves</surname><given-names>A</given-names> </name><name name-style="western"><surname>Schmidhuber</surname><given-names>J</given-names> </name></person-group><article-title>Framewise phoneme classification with bidirectional LSTM and other neural network architectures</article-title><source>Neural Netw</source><year>2005</year><volume>18</volume><issue>5-6</issue><fpage>602</fpage><lpage>610</lpage><pub-id pub-id-type="doi">10.1016/j.neunet.2005.06.042</pub-id><pub-id pub-id-type="medline">16112549</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Graves</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mohamed</surname><given-names>A-r</given-names> </name><name name-style="western"><surname>Hinton</surname><given-names>G</given-names> </name></person-group><article-title>Speech recognition with deep recurrent neural networks</article-title><conf-name>IEEE International Conference on Acoustics, Speech and Signal Processing</conf-name><conf-date>May 26-31, 2013</conf-date><conf-loc>Vancouver, BC</conf-loc><fpage>6645</fpage><lpage>6649</lpage><pub-id pub-id-type="doi">10.1109/ICASSP.2013.6638947</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bock</surname><given-names>S</given-names> </name><name name-style="western"><surname>Schedl</surname><given-names>M</given-names> </name></person-group><article-title>Polyphonic piano note transcription with recurrent neural networks</article-title><conf-name>IEEE International Conference on Acoustics, Speech and Signal Processing</conf-name><conf-date>Mar 25-30, 2012</conf-date><conf-loc>Kyoto, Japan</conf-loc><fpage>121</fpage><lpage>124</lpage><pub-id pub-id-type="doi">10.1109/ICASSP.2012.6287832</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Valenti</surname><given-names>M</given-names> </name><name name-style="western"><surname>Squartini</surname><given-names>S</given-names> </name><name name-style="western"><surname>Diment</surname><given-names>A</given-names> </name><name name-style="western"><surname>Parascandolo</surname><given-names>G</given-names> </name><name name-style="western"><surname>Virtanen</surname><given-names>T</given-names> </name></person-group><article-title>A convolutional neural network approach for acoustic scene 
classification</article-title><conf-name>International Joint Conference on Neural Networks</conf-name><conf-date>May 14-19, 2017</conf-date><conf-loc>Anchorage, AK</conf-loc><fpage>1547</fpage><lpage>1554</lpage><pub-id pub-id-type="doi">10.1109/IJCNN.2017.7966035</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sainath</surname><given-names>TN</given-names> </name><name name-style="western"><surname>Vinyals</surname><given-names>O</given-names> </name><name name-style="western"><surname>Senior</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sak</surname><given-names>H</given-names> </name></person-group><article-title>Convolutional, long short-term memory, fully connected deep neural networks</article-title><conf-name>IEEE International Conference on Acoustics, Speech and Signal Processing</conf-name><conf-date>Apr 19-24, 2015</conf-date><conf-loc>Queensland, Australia</conf-loc><fpage>4580</fpage><lpage>4584</lpage><pub-id pub-id-type="doi">10.1109/ICASSP.2015.7178838</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cakir</surname><given-names>E</given-names> </name><name name-style="western"><surname>Parascandolo</surname><given-names>G</given-names> </name><name name-style="western"><surname>Heittola</surname><given-names>T</given-names> </name><name name-style="western"><surname>Huttunen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Virtanen</surname><given-names>T</given-names> </name></person-group><article-title>Convolutional recurrent neural networks for polyphonic sound event detection</article-title><source>IEEE/ACM Trans Audio Speech Lang 
Process</source><year>2017</year><volume>25</volume><issue>6</issue><fpage>1291</fpage><lpage>1303</lpage><pub-id pub-id-type="doi">10.1109/TASLP.2017.2690575</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Purwins</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>B</given-names> </name><name name-style="western"><surname>Virtanen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Schluter</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>SY</given-names> </name><name name-style="western"><surname>Sainath</surname><given-names>T</given-names> </name></person-group><article-title>Deep learning for audio signal processing</article-title><source>IEEE J Sel Top Signal Process</source><year>2019</year><month>05</month><volume>13</volume><issue>2</issue><fpage>206</fpage><lpage>219</lpage><pub-id pub-id-type="doi">10.1109/JSTSP.2019.2908700</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Adavanne</surname><given-names>S</given-names> </name><name name-style="western"><surname>Virtanen</surname><given-names>T</given-names> </name></person-group><article-title>A report on sound event detection with different binaural features</article-title><source>arXiv</source><access-date>2021-09-09</access-date><comment>Preprint posted online on  Oct 9, 2017</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1710.02997">http://arxiv.org/abs/1710.02997</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>LeCun</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Bengio</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Hinton</surname><given-names>G</given-names> </name></person-group><article-title>Deep learning</article-title><source>Nature</source><year>2015</year><month>05</month><day>28</day><volume>521</volume><issue>7553</issue><fpage>436</fpage><lpage>444</lpage><pub-id pub-id-type="doi">10.1038/nature14539</pub-id><pub-id pub-id-type="medline">26017442</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Springenberg</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Dosovitskiy</surname><given-names>A</given-names> </name><name name-style="western"><surname>Brox</surname><given-names>T</given-names> </name><name name-style="western"><surname>Riedmiller</surname><given-names>M</given-names> </name></person-group><article-title>Striving for simplicity: the all convolutional net</article-title><source>arXiv</source><access-date>2021-09-18</access-date><comment>Preprint posted online on Apr 13, 2015</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1412.6806">http://arxiv.org/abs/1412.6806</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kingma</surname><given-names>DP</given-names> </name><name name-style="western"><surname>Ba</surname><given-names>J</given-names> </name></person-group><article-title>Adam: a method for stochastic optimization</article-title><source>arXiv</source><access-date>2021-06-07</access-date><comment>Preprint posted online on Jan 29, 2017</comment><comment><ext-link ext-link-type="uri" 
xlink:href="http://arxiv.org/abs/1412.6980">http://arxiv.org/abs/1412.6980</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="web"><article-title>Ce-kishi/FEDA-medalarms</article-title><source>GitHub</source><access-date>2025-05-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/ce-kishi/FEDA-MedAlarms">https://github.com/ce-kishi/FEDA-MedAlarms</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Salamon</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bello</surname><given-names>JP</given-names> </name></person-group><article-title>Deep convolutional neural networks and data augmentation for environmental sound classification</article-title><source>IEEE Signal Process Lett</source><year>2017</year><month>03</month><volume>24</volume><issue>3</issue><fpage>279</fpage><lpage>283</lpage><pub-id pub-id-type="doi">10.1109/LSP.2017.2657381</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Park</surname><given-names>DS</given-names> </name><name name-style="western"><surname>Chan</surname><given-names>W</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>SpecAugment: a simple data augmentation method for automatic speech recognition</article-title><conf-name>Annual Conference of the International Speech Communication Association</conf-name><conf-date>Sep 15-19, 2019</conf-date><conf-loc>Graz, Austria</conf-loc><fpage>2613</fpage><lpage>2617</lpage><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.isca-archive.org/interspeech_2019">https://www.isca-archive.org/interspeech_2019</ext-link></comment><pub-id pub-id-type="doi">10.21437/Interspeech.2019-2680</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="web"><source>TUT-ARG/sed_eval</source><year>2021</year><access-date>2021-09-19</access-date><publisher-name>GitHub</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/TUT-ARG/sed_eval">https://github.com/TUT-ARG/sed_eval</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><source>TUT-ARG/sed_vis</source><year>2021</year><access-date>2021-09-19</access-date><publisher-name>GitHub</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/TUT-ARG/sed_vis">https://github.com/TUT-ARG/sed_vis</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Scott</surname><given-names>JB</given-names> </name><name name-style="western"><surname>De Vaux</surname><given-names>L</given-names> </name><name name-style="western"><surname>Dills</surname><given-names>C</given-names> </name><name name-style="western"><surname>Strickland</surname><given-names>SL</given-names> </name></person-group><article-title>Mechanical ventilation alarms and alarm fatigue</article-title><source>Respir Care</source><year>2019</year><month>10</month><volume>64</volume><issue>10</issue><fpage>1308</fpage><lpage>1313</lpage><pub-id pub-id-type="doi">10.4187/respcare.06878</pub-id><pub-id pub-id-type="medline">31213570</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mesaros</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Diment</surname><given-names>A</given-names> </name><name name-style="western"><surname>Elizalde</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Sound event detection in the DCASE 2017 challenge</article-title><source>IEEE/ACM Trans Audio Speech Lang Process</source><year>2019</year><month>06</month><volume>27</volume><issue>6</issue><fpage>992</fpage><lpage>1006</lpage><pub-id pub-id-type="doi">10.1109/TASLP.2019.2907016</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Segment-based class-wise metrics at a signal-to-noise ratio of 0 dB across 5-fold cross-validation.</p><media xlink:href="medinform_v13i1e35987_app1.docx" xlink:title="DOCX File, 20 KB"/></supplementary-material></app-group></back></article>