<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i2e25530</article-id>
      <article-id pub-id-type="pmid">33616536</article-id>
      <article-id pub-id-type="doi">10.2196/25530</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Similarity-Based Unsupervised Spelling Correction Using BioWordVec: Development and Usability Study of Bacterial Culture and Antimicrobial Susceptibility Reports</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Chen</surname>
            <given-names>Qingyu</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Kim</surname>
            <given-names>Taehyeong</given-names>
          </name>
          <degrees>BS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3499-9444</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Han</surname>
            <given-names>Sung Won</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0040-3542</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Kang</surname>
            <given-names>Minji</given-names>
          </name>
          <degrees>BS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8800-2585</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Lee</surname>
            <given-names>Se Ha</given-names>
          </name>
          <degrees>BS</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6808-4990</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Kim</surname>
            <given-names>Jong-Ho</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1309-0821</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Joo</surname>
            <given-names>Hyung Joon</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <address>
            <institution>Department of Cardiology</institution>
            <institution>Cardiovascular Center</institution>
            <institution>Korea University College of Medicine</institution>
            <addr-line>73, Inchon-ro</addr-line>
            <addr-line>Seongbuk-gu, 02841</addr-line>
            <country>Republic of Korea</country>
            <phone>82 2 920 6411</phone>
            <email>drjoohj@gmail.com</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1846-8464</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Sohn</surname>
            <given-names>Jang Wook</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4792-0456</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Division of Industrial Management Engineering</institution>
        <institution>Korea University</institution>
        <addr-line>Seoul</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Information Computing Office</institution>
        <institution>Korea University Anam Hospital</institution>
        <addr-line>Seoul</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Cardiology</institution>
        <institution>Cardiovascular Center</institution>
        <institution>Korea University College of Medicine</institution>
        <addr-line>Seongbuk-gu</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Division of Infectious Disease</institution>
        <institution>Department of Internal Medicine</institution>
        <institution>Korea University College of Medicine</institution>
        <addr-line>Seoul</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Hyung Joon Joo <email>drjoohj@gmail.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>2</month>
        <year>2021</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>22</day>
        <month>2</month>
        <year>2021</year>
      </pub-date>
      <volume>9</volume>
      <issue>2</issue>
      <elocation-id>e25530</elocation-id>
      <history>
        <date date-type="received">
          <day>7</day>
          <month>11</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>2</day>
          <month>12</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>12</day>
          <month>1</month>
          <year>2021</year>
        </date>
        <date date-type="accepted">
          <day>20</day>
          <month>1</month>
          <year>2021</year>
        </date>
      </history>
      <copyright-statement>©Taehyeong Kim, Sung Won Han, Minji Kang, Se Ha Lee, Jong-Ho Kim, Hyung Joon Joo, Jang Wook Sohn. Originally published in JMIR Medical Informatics (http://medinform.jmir.org), 22.02.2021.</copyright-statement>
      <copyright-year>2021</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on http://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2021/2/e25530" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Existing bacterial culture test results for infectious diseases are written in unrefined text, resulting in many problems, including typographical errors and stop words. Effective spelling correction processes are needed to ensure the accuracy and reliability of data for the study of infectious diseases, including medical terminology extraction. If a dictionary is established, spelling algorithms using edit distance are efficient. However, in the absence of a dictionary, traditional spelling correction algorithms that utilize only edit distances have limitations.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>In this research, we proposed a similarity-based spelling correction algorithm using pretrained word embedding with the BioWordVec technique. This method uses a character-level N-grams–based distributed representation through unsupervised learning rather than the existing rule-based method. In other words, we propose a framework that detects and corrects typographical errors when a dictionary is not in place.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>For detected typographical errors not mapped to Systematized Nomenclature of Medicine (SNOMED) clinical terms, a correction candidate group with high similarity considering the edit distance was generated using pretrained word embedding from the clinical database. From the embedding matrix in which the vocabulary is arranged in descending order according to frequency, a grid search was used to search for candidate groups of similar words. Thereafter, the correction candidate words were ranked in consideration of the frequency of the words, and the typographical errors were finally corrected according to the ranking.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Bacterial identification words were extracted from 27,544 bacterial culture and antimicrobial susceptibility reports, and 16 types of spelling errors and 914 misspelled words were found. The similarity-based spelling correction algorithm using BioWordVec proposed in this research corrected 12 types of typographical errors and showed very high performance in correcting 97.48% (based on F1 score) of all spelling errors.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>This tool corrected spelling errors effectively in the absence of a dictionary based on bacterial identification words in bacterial culture and antimicrobial susceptibility reports. This method will help build a high-quality refined database of vast text data for electronic health records.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>spelling correction</kwd>
        <kwd>natural language processing</kwd>
        <kwd>bacteria</kwd>
        <kwd>electronic health record</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Among various industries, the medical industry produces many unstructured forms of examination reports. It is very important to establish a structured form of accurate medical documentation to provide accurate diagnoses and treatments to patients [<xref ref-type="bibr" rid="ref1">1</xref>]. False medical information because of spelling errors can lead to medical and/or treatment errors, resulting in serious risks for patients. For example, errors in the spelling of organism names or drugs with similar spelling in bacterial culture tests have negative effects on not only the diagnosis and treatment of patients, but also the management of infectious diseases and nosocomial infections in hospitals.</p>
        <p>While many patient electronic health records are documented in a structured form, the bacterial culture report is still stored as images or as an unrefined text data form in most hospitals. Mapping terms for bacterial identification are necessary to proceed with medical data studies, such as detection and diffusion path studies of infectious diseases. However, since large-scale clinical text data are mostly written by doctors or semiautomatic systems, there can be problems with data consistency, typographical errors, and stop words [<xref ref-type="bibr" rid="ref2">2</xref>].</p>
        <p>In clinical text data, the extraction-transformation-load (ETL) process for medical terms is typically performed through exact string matching of words that appear in the dictionary. However, words not present in the dictionary or severely misspelled words have difficulty matching to terms. Because medical terms are complex and field specific, this problem makes it difficult to apply the same general data refining methods [<xref ref-type="bibr" rid="ref3">3</xref>]. Rule-based spelling correction algorithms cannot ensure the accuracy and reliability of the data because of incorrect data preprocessing. This method also has to check all test results and find the errors directly, resulting in a considerable cost problem.</p>
      </sec>
      <sec>
        <title>Related Work</title>
        <sec>
          <title>Spelling Correction in the Medical Domain</title>
          <p>It is very difficult to construct dictionaries for all medical terms and abbreviations. A related study of spelling correction algorithms specialized in medical record text data was conducted. Lai et al [<xref ref-type="bibr" rid="ref4">4</xref>] proposed a noisy channel-based spelling check algorithm for medical text. Named entity recognition (NER) was used to achieve an error detection performance of up to 94.4% with a spelling correction accuracy of up to 88.2%, producing high performance spelling correction results in various clinical documents. Fivez et al [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>] proposed a spelling check algorithm for clinical free text using fastText of the N-gram embedding technique. After generating misspelled words in MIMIC-III [<xref ref-type="bibr" rid="ref7">7</xref>] to measure similarity with the candidate group that fits the context, the similarity was ranked using the Damerau-Levenshtein distance. This method suggested a way to solve the out-of-vocabulary (OOV) problem in clinical data.</p>
        </sec>
        <sec>
          <title>Subword-Level Word Vector Representation</title>
          <p>Traditional spelling correction algorithms using edit distance or pronunciation algorithms have limitations in correcting word-level issues that fit the context. There are subword-level embedding methods for learning concurrent word information to consider context understanding. FastText [<xref ref-type="bibr" rid="ref8">8</xref>] expresses a word as the sum of its character-level N-gram vectors. Embedding at the subword level overcomes the difficulty of applying word-level embedding to languages with rich morphological variation or to low-frequency words. This method was strong at solving the OOV problem, and accuracy was high for rare words in the word set. BioWordVec [<xref ref-type="bibr" rid="ref9">9</xref>] learns clinical record data from PubMed and MIMIC-III clinical databases using fastText. Based on 28,714,373 PubMed documents and 2,083,180 MIMIC-III clinical database documents, the entire corpus was built. The Medical Subject Headings (MeSH) term graph was organized to create a heading sequence and to carry out word embedding based on a sequence combining MeSH and PubMed. BioWordVec provided a 200-dimensional pretrained word embedding matrix.</p>
        </sec>
      </sec>
      <sec>
        <title>Limitations With Existing Approaches</title>
        <p>The method proposed by Lai et al [<xref ref-type="bibr" rid="ref4">4</xref>] has a limitation in that spelling corrections are not made in the absence of a dictionary. The method proposed by Fivez et al [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>] solves the OOV problem, but has a similar limitation in that spelling corrections are not made in the absence of a dictionary.</p>
      </sec>
      <sec>
        <title>Our Approach</title>
        <p>This paper proposes a similarity-based spelling correction algorithm that uses pretrained word embedding for medical field data. Using the character-level BioWordVec model, which was pretrained on clinical record data from the MIMIC-III clinical database, the model learns spelling correction end-to-end. The proposed model has the advantage of being able to make spelling corrections in the absence of a dictionary. In addition, it is effective against new types of typographical errors that may occur in the future, and it is highly applicable in the field because it uses unsupervised learning that does not require direct label assignment. We aimed to use this model to develop a spelling correction system suitable for various types of medical text data.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Set</title>
        <sec>
          <title>Bacterial Culture and Antimicrobial Susceptibility Reports</title>
          <p>In this study, the bacterial culture and antimicrobial susceptibility reports from Korea University Anam Hospital, Korea University Guro Hospital, and Korea University Ansan Hospital were used. The bacterial culture and antimicrobial susceptibility report data were collected for 17 years (from 2002 to 2018), and in each year, reports for 1 month were used for the experiment. In total, 180,000 items were retrieved, with 27,544 having meaningful test results. Using the self-developed rule-based ETL algorithm [<xref ref-type="bibr" rid="ref10">10</xref>], unstructured bacterial culture and antimicrobial susceptibility reports were converted into structured text data. After preprocessing through lexical processing, such as sentence segmentation, tokenization, and stemming using regular expressions, there were 320 types of bacterial identification words in the report. Among the extracted bacterial identification words, 16 types of spelling errors and 914 misspelled words were found. <xref ref-type="table" rid="table1">Table 1</xref> presents the typographical errors based on their occurrence.</p>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Misspelling frequency table.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="500"/>
              <col width="500"/>
              <thead>
                <tr valign="top">
                  <td>Misspelling</td>
                  <td>Occurrence, n</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>stapylococcus</td>
                  <td>827</td>
                </tr>
                <tr valign="top">
                  <td>sstreptococcus</td>
                  <td>21</td>
                </tr>
                <tr valign="top">
                  <td>adecarboxylate</td>
                  <td>19</td>
                </tr>
                <tr valign="top">
                  <td>parpinfluenzae</td>
                  <td>18</td>
                </tr>
                <tr valign="top">
                  <td>papatyphi</td>
                  <td>7</td>
                </tr>
                <tr valign="top">
                  <td>pseudodiphthericum</td>
                  <td>6</td>
                </tr>
                <tr valign="top">
                  <td>urealyticm</td>
                  <td>5</td>
                </tr>
                <tr valign="top">
                  <td>chromogens</td>
                  <td>2</td>
                </tr>
                <tr valign="top">
                  <td>flavbacterium</td>
                  <td>2</td>
                </tr>
                <tr valign="top">
                  <td>ferentum</td>
                  <td>1</td>
                </tr>
                <tr valign="top">
                  <td>koneensis</td>
                  <td>1</td>
                </tr>
                <tr valign="top">
                  <td>ochrobacterium</td>
                  <td>1</td>
                </tr>
                <tr valign="top">
                  <td>orytihabitans</td>
                  <td>1</td>
                </tr>
                <tr valign="top">
                  <td>shingobacterium</td>
                  <td>1</td>
                </tr>
                <tr valign="top">
                  <td>stacherbrandfii</td>
                  <td>1</td>
                </tr>
                <tr valign="top">
                  <td>perosis</td>
                  <td>1</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
        </sec>
      </sec>
      <sec>
        <title>Methodology</title>
        <sec>
          <title>Misspelling Detection</title>
          <p>Systematized Nomenclature of Medicine (SNOMED) clinical terms (CT) [<xref ref-type="bibr" rid="ref11">11</xref>] is a set of systematically structured medical terms used in medical clinical documents and reports. It is the world’s largest multilingual clinical terminology system. In the corpus constructed by tokenizing the bacterial identification result reports, words that were not mapped to SNOMED CT were defined and detected as typographical errors [<xref ref-type="bibr" rid="ref12">12</xref>].</p>
        </sec>
        <sec>
          <title>Candidate Generation</title>
          <p>Using the fastText [<xref ref-type="bibr" rid="ref8">8</xref>] technique, prelearned word embedding was used to generate a group of corrected word candidates with high similarity considering the edit distance. In this study, the BioWordVec [<xref ref-type="bibr" rid="ref9">9</xref>] model that was prelearned from the clinical database was used.</p>
          <p>The number of words that were most similar, cosine similarity, and edit distance were set as hyperparameters for generating a correction candidate group. In addition, constraints for candidate words were used based on the dictionary constructed for the existing general terms, the length of the word, and the frequency of the word. In this study, the number of most similar words was set to 30, the cosine similarity threshold was set to 0.80, and the edit distance threshold was set to 3 as hyperparameters.</p>
          <p>Character-based spell checking algorithms were used to determine edit distances to generate or rank candidate groups. The Levenshtein edit distance [<xref ref-type="bibr" rid="ref13">13</xref>] is the minimum number of single-character operations (insertion, deletion, and replacement) required to convert one word into another. The Damerau-Levenshtein distance [<xref ref-type="bibr" rid="ref14">14</xref>] additionally counts a transposition (swapping of two adjacent characters) as a single operation, which allows it to cover most spelling errors. The model proposed in this paper uses the Damerau-Levenshtein distance as the edit distance. The formula is as follows:</p>
          <disp-formula>
            <graphic xlink:href="medinform_v9i2e25530_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
        </sec>
        <sec>
          <title>Candidate Ranking</title>
          <p>The final correction word is suggested by ranking the correction candidate groups. The pretrained word embedding was learned by the fastText technique, and the vocabulary was sorted in descending order according to frequency. The methodology proposed in this study has two assumptions. First, in clinical databases, correctly spelled words may appear relatively more frequently than misspelled words [<xref ref-type="bibr" rid="ref15">15</xref>]. Second, the larger the corpus used for learning, the greater the frequency of correctly spelled words [<xref ref-type="bibr" rid="ref15">15</xref>]. The BioWordVec [<xref ref-type="bibr" rid="ref9">9</xref>] model used in this research can sufficiently satisfy the above two assumptions.</p>
          <p>The model proposed in this research limited the search for the range of the most similar words. Through a grid search, a similarity-based candidate group that considers the frequency of words was proposed [<xref ref-type="bibr" rid="ref16">16</xref>]. After sorting the ranking of the generated correction candidate words based on similarity, typographical errors can be corrected.</p>
        </sec>
      </sec>
      <sec>
        <title>Overall Architecture</title>
        <p><xref rid="figure1" ref-type="fig">Figure 1</xref> shows the architecture of the spelling correction algorithm proposed in this paper.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Similarity-based unsupervised spelling correction architecture. SNOMED: Systematized Nomenclature of Medicine.</p>
          </caption>
          <graphic xlink:href="medinform_v9i2e25530_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Experiments</title>
        <p>The typographical errors that appear in bacterial culture and antimicrobial susceptibility reports can all be corrected within an edit distance of 3, as shown in <xref ref-type="table" rid="table2">Table 2</xref>. Most typographical errors have a correctly spelled word within an edit distance of 1. Therefore, in the model proposed in this study, the threshold of the edit distance for generating the correction candidate group was set to 3 or less.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Correction table using edit distance.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>Correction</td>
                <td>Edit distance</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>stapylococcus to staphylococcus</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>sstreptococcus to streptococcus</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>adecarboxylate to adecarboxylata</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>parpinfluenzae to parainfluenzae</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>papatyphi to paratyphi</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>pseudodiphthericum to pseudodiphtheriticum</td>
                <td>2</td>
              </tr>
              <tr valign="top">
                <td>urealyticm to urealyticum</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>chromogens to chromogenes</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>flavbacterium to flavobacterium</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>ferentum to fermentum</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>koneensis to koreensis</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>ochrobacterium to ochrobactrum</td>
                <td>2</td>
              </tr>
              <tr valign="top">
                <td>orytihabitans to oryzihabitans</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>shingobacterium to sphingobacterium</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>stacherbrandfii to stackebrandtii</td>
                <td>3</td>
              </tr>
              <tr valign="top">
                <td>perosis to peroris</td>
                <td>1</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <sec>
          <title>Comparison of Pretrained Embeddings</title>
          <p>All of the pretrained word embeddings used in this study were learned based on the fastText methodology, and the corpus was constructed without distinction between spelling errors and correct spelling during learning. To compare the performance of the BioWordVec model introduced in the previous study, four pretrained embeddings provided by Facebook were used.</p>
          <p>The following are the five pretrained embeddings: (1) BioWordVec, 200-dimensional embedding vectors learned using fastText for PubMed and MIMIC-III; (2) English word vectors, 300-dimensional embedding vectors learned using fastText for general text and from Wikipedia; (3) Crawled English subword vectors, 300-dimensional embedding vectors learned using fastText for the 2,000,000 lower words that appear in English word documents; (4) Wiki word vectors, 300-dimensional embedding vectors learned using fastText in Wikipedia; (5) Simple Wiki word vectors, 300-dimensional embedding vectors learned using fastText in Simple Wikipedia.</p>
          <p>The cosine similarity threshold of all models was set to 0.80 or higher, the edit distance threshold was set to 3 or less, and the number of most similar words was set to 30, so that all models were tested under the same conditions. The evaluation index is the correction rate, defined as the proportion of the 16 types of typographical errors appearing in the bacterial identification reports that were corrected to the exact spelling. <xref ref-type="table" rid="table3">Table 3</xref> shows the rate of correction for typographical errors according to pretrained embeddings.</p>
          <p>The spelling correction algorithm using BioWordVec showed very high performance compared to the performance of the other pretrained word embedding models. The methodology proposed in this study has the advantage of being used even in the absence of a dictionary. However, it was confirmed that pretrained word embedding based on the clinical database is necessary to correct errors in the bacterial identification report.</p>
          <table-wrap position="float" id="table3">
            <label>Table 3</label>
            <caption>
              <p>Comparison of pretrained embedding.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="500"/>
              <col width="500"/>
              <thead>
                <tr valign="top">
                  <td>Pretrained embedding model</td>
                  <td>Correction rate</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>BioWordVec</td>
                  <td>0.75</td>
                </tr>
                <tr valign="top">
                  <td>English word vectors</td>
                  <td>0.00</td>
                </tr>
                <tr valign="top">
                  <td>Crawled English subword vectors</td>
                  <td>0.00</td>
                </tr>
                <tr valign="top">
                  <td>Wiki word vectors</td>
                  <td>0.31</td>
                </tr>
                <tr valign="top">
                  <td>Simple Wiki word vectors</td>
                  <td>0.19</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
        </sec>
      </sec>
      <sec>
        <title>Evaluation</title>
        <p>Through a comparative experiment as shown in <xref ref-type="table" rid="table3">Table 3</xref>, it is possible to correct typographical errors using pretrained word embedding without building a dictionary. To evaluate the performance of the model proposed in this study, its performance was compared with a rule-based spelling correction algorithm [<xref ref-type="bibr" rid="ref17">17</xref>] using a dictionary and a situation without spelling correction. SymSpell [<xref ref-type="bibr" rid="ref18">18</xref>] was used as a spelling correction algorithm based on the edit distance rule.</p>
        <p>SymSpell [<xref ref-type="bibr" rid="ref18">18</xref>] can correct typographical errors 1 million times faster than rule-based spelling correction [<xref ref-type="bibr" rid="ref17">17</xref>] and can use existing dictionaries through a symmetric deletion spelling correction algorithm. SymSpell uses the Damerau-Levenshtein edit distance [<xref ref-type="bibr" rid="ref14">14</xref>], which was set to 3 for the experiment under the same conditions as the model proposed in this study. SCOWL [<xref ref-type="bibr" rid="ref19">19</xref>] and the Dorland medical dictionary [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>] were used as dictionaries for SymSpell, and a dictionary containing a total of 100,000 correctly spelled words was constructed.</p>
        <p><xref ref-type="table" rid="table4">Table 4</xref> shows the evaluation results through the NER task that extracts the bacterial identification words. In the table, accuracy is the proportion of words corrected among all misspellings. Precision is the proportion of corrected words that exactly match the actual corrections. Recall is the proportion of actual typographical errors that were corrected accurately. F1 score is the harmonic mean of precision and recall. SUSC (similarity-based unsupervised spelling correction) in <xref ref-type="table" rid="table4">Table 4</xref> is the model proposed in this study.</p>
        <p>In this study, the similarity-based spell checking algorithm SUSC using BioWordVec corrected 12 types of typographical errors and showed very high performance in correcting 97.48% (based on F1 score) of all spelling errors. Both models were able to correct frequent typographical errors, so the overall correction rate was high. However, since SymSpell only corrects certain words, the F1 score showed little difference compared with the situation without spelling correction. The Dorland medical dictionary did not fully cover bacterial identification names for infectious diseases, and the rule-based spell checking algorithms using edit distance did not work well according to the established dictionaries. Constructing an accurate dictionary that can be used in a rule-based spell checking algorithm is very expensive and time consuming.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Model performance using BioWordVec.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="360"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td>Accuracy</td>
                <td>Precision</td>
                <td>Recall</td>
                <td>F1 score</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>No spelling correction</td>
                <td>0.98</td>
                <td>0.94</td>
                <td>0.93</td>
                <td>0.94</td>
              </tr>
              <tr valign="top">
                <td>SymSpell</td>
                <td>1.00</td>
                <td>0.94</td>
                <td>0.94</td>
                <td>0.94</td>
              </tr>
              <tr valign="top">
                <td>SUSC<sup>a</sup> (BioWordVec)</td>
                <td>1.00</td>
                <td>0.97</td>
                <td>0.97</td>
                <td>0.97</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>SUSC: similarity-based unsupervised spelling correction.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <sec>
          <title>Comparison of Similarity</title>
          <p>Using the SUSC model proposed in this study, the degree of similarity of words depending on correction was examined. <xref ref-type="table" rid="table5">Table 5</xref> shows the similarity of words according to whether they are corrected.</p>
          <p>As shown in <xref ref-type="table" rid="table5">Table 5</xref>, typographical errors that were not corrected to the correct spelling have, on the whole, low cosine similarity with the correctly spelled word. In the case of nonword errors, which involve words that do not actually exist, most of the words were corrected accurately. Miscorrected typographical errors included real-word errors where the word actually exists but is not appropriate for the grammar or context. Since real-word errors are determined to be similar in meaning to words that do not fit the situation, the cosine similarity is relatively low for the word vector to be corrected. The model proposed in this study has the advantage of quantitatively expressing the relative distance between typographical errors and correctly spelled words by utilizing the similarity between words. Through the proposed model, it is possible to compare and determine whether the error detected with the framework is actually a typographical error that can occur often.</p>
          <table-wrap position="float" id="table5">
            <label>Table 5</label>
            <caption>
              <p>Comparison of similarity according to correction.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="500"/>
              <col width="250"/>
              <col width="250"/>
              <thead>
                <tr valign="top">
                  <td>Change</td>
                  <td>Correction</td>
                  <td>Similarity</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>adecarboxylate to adecarboxylata</td>
                  <td>Corrected</td>
                  <td>0.90</td>
                </tr>
                <tr valign="top">
                  <td>flavbacterium to flavobacterium</td>
                  <td>Corrected</td>
                  <td>0.83</td>
                </tr>
                <tr valign="top">
                  <td>koneensis to koreensis</td>
                  <td>Corrected</td>
                  <td>0.87</td>
                </tr>
                <tr valign="top">
                  <td>ochrobacterium to ochrobactrum</td>
                  <td>Corrected</td>
                  <td>0.93</td>
                </tr>
                <tr valign="top">
                  <td>orytihabitans to oryzihabitans</td>
                  <td>Corrected</td>
                  <td>0.90</td>
                </tr>
                <tr valign="top">
                  <td>papatyphi to paratyphi</td>
                  <td>Corrected</td>
                  <td>0.89</td>
                </tr>
                <tr valign="top">
                  <td>parpinfluenzae to parainfluenzae</td>
                  <td>Corrected</td>
                  <td>0.86</td>
                </tr>
                <tr valign="top">
                  <td>pseudodiphtericum to pseudodiphtheriticum</td>
                  <td>Corrected</td>
                  <td>0.93</td>
                </tr>
                <tr valign="top">
                  <td>shingobacterium to sphingobacterium</td>
                  <td>Corrected</td>
                  <td>0.93</td>
                </tr>
                <tr valign="top">
                  <td>sstreptococcus to streptococcus</td>
                  <td>Corrected</td>
                  <td>0.95</td>
                </tr>
                <tr valign="top">
                  <td>stapylococcus to staphylococcus</td>
                  <td>Corrected</td>
                  <td>0.88</td>
                </tr>
                <tr valign="top">
                  <td>urealyticm to urealyticum</td>
                  <td>Corrected</td>
                  <td>0.84</td>
                </tr>
                <tr valign="top">
                  <td>chromogens to chromogenes</td>
                  <td>Not corrected</td>
                  <td>0.71</td>
                </tr>
                <tr valign="top">
                  <td>ferentum to fermentum</td>
                  <td>Not corrected</td>
                  <td>0.47</td>
                </tr>
                <tr valign="top">
                  <td>perosis to peroris</td>
                  <td>Not corrected</td>
                  <td>0.42</td>
                </tr>
                <tr valign="top">
                  <td>stacherbrandfii to stackebrandtii</td>
                  <td>Not corrected</td>
                  <td>0.59</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <p>It is difficult to compare our results with previous results because the system implementation and data set used in the related work are not publicly available. The model proposed in this research was capable of spelling correction through unsupervised learning, but it lacked the performance required for infrequent typographical errors and real-word errors. In addition, there was a problem of arbitrarily setting the reference values for cosine similarity and edit distance when creating a correction candidate group. Methods should be devised to establish appropriate thresholds for hyperparameters through experiments.</p>
      <p>This research proposes a similarity-based spelling correction algorithm using pretrained word embedding to extract correct medical terminology from unstructured text data related to infectious diseases. The suggested algorithm has the advantage of being able to check spelling and make corrections in the absence of a correct spelling dictionary. In addition, it solves the OOV problem and can modify words based on context.</p>
      <p>As a result of the experiments conducted in this research, we were able to detect and correct spelling errors in the absence of a dictionary for bacterial terms appearing in bacterial culture and antimicrobial susceptibility reports. Our model efficiently refined and processed large medical text data. It has been proven experimentally that it is a method suitable for processing natural language involving high expertise and complexity, such as medical terminology. Ideally, the results of this research will serve as a foundation to build vast amounts of text data in electronic health records into high-quality databases.</p>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">CT</term>
          <def>
            <p>clinical terms</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">ETL</term>
          <def>
            <p>extract-transformation-load</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">MeSH</term>
          <def>
            <p>Medical Subject Headings</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">NER</term>
          <def>
            <p>named entity recognition</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">OOV</term>
          <def>
            <p>out-of-vocabulary</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">SNOMED</term>
          <def>
            <p>Systematized Nomenclature of Medicine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">SUSC</term>
          <def>
            <p>similarity-based unsupervised spelling correction</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This research was supported by a grant from the Korea Health Technology R&#38;D Project through the Korea Health Industry Development Institute (KHIDI), funded by the Ministry of Health &#38; Welfare, Republic of Korea (grant number: HI19C0201, HI19C0360). This research was also supported by Brain Korea 21 FOUR.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Glance</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Statistical semantic and clinician confidence analysis for correcting abbreviations and spelling errors in clinical progress notes</article-title>
          <source>Artif Intell Med</source>
          <year>2011</year>
          <month>11</month>
          <volume>53</volume>
          <issue>3</issue>
          <fpage>171</fpage>
          <lpage>80</lpage>
          <pub-id pub-id-type="doi">10.1016/j.artmed.2011.08.003</pub-id>
          <pub-id pub-id-type="medline">21924593</pub-id>
          <pub-id pub-id-type="pii">S0933-3657(11)00107-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hersh</surname>
              <given-names>WR</given-names>
            </name>
            <name name-style="western">
              <surname>Campbell</surname>
              <given-names>EM</given-names>
            </name>
            <name name-style="western">
              <surname>Malveau</surname>
              <given-names>SE</given-names>
            </name>
          </person-group>
          <article-title>Assessing the feasibility of large-scale natural language processing in a corpus of ordinary medical records: a lexical analysis</article-title>
          <source>Proc AMIA Annu Fall Symp</source>
          <year>1997</year>
          <fpage>580</fpage>
          <lpage>4</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/9357692"/>
          </comment>
          <pub-id pub-id-type="medline">9357692</pub-id>
          <pub-id pub-id-type="pmcid">PMC2233467</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Patrick</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sabbagh</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Jain</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Spelling correction in clinical notes with emphasis on first suggestion accuracy</article-title>
          <source>Proceedings of 2nd Workshop on Building and Evaluating Resources for Biomedical Text Mining (BioTxtM2010)</source>
          <year>2010</year>
          <conf-name>2nd Workshop on Building and Evaluating Resources for Biomedical Text Mining</conf-name>
          <conf-date>2010</conf-date>
          <conf-loc>Malta</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Topaz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Goss</surname>
              <given-names>FR</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Automated misspelling detection and correction in clinical free-text records</article-title>
          <source>J Biomed Inform</source>
          <year>2015</year>
          <month>06</month>
          <volume>55</volume>
          <fpage>188</fpage>
          <lpage>95</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(15)00075-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2015.04.008</pub-id>
          <pub-id pub-id-type="medline">25917057</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(15)00075-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fivez</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Šuster</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Daelemans</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Unsupervised Context-Sensitive Spelling Correction of English and Dutch Clinical Free-Text with Word and Character N-Gram Embeddings</article-title>
          <source>Computational Linguistics in the Netherlands Journal</source>
          <year>2017</year>
          <volume>7</volume>
          <fpage>39</fpage>
          <lpage>52</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.clinjournal.org/clinj/article/view/67"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fivez</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Šuster</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Daelemans</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Unsupervised Context-Sensitive Spelling Correction of Clinical Free-Text with Word and Character N-Gram Embeddings</article-title>
          <source>Proceedings of the BioNLP 2017 Workshop</source>
          <year>2017</year>
          <conf-name>BioNLP 2017 Workshop</conf-name>
          <conf-date>August 4, 2017</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
          <fpage>143</fpage>
          <lpage>148</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/W17-2317.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/w17-2317</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>AEW</given-names>
            </name>
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lehman</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghassemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Moody</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>RG</given-names>
            </name>
          </person-group>
          <article-title>MIMIC-III, a freely accessible critical care database</article-title>
          <source>Sci Data</source>
          <year>2016</year>
          <month>05</month>
          <day>24</day>
          <volume>3</volume>
          <fpage>160035</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/sdata.2016.35"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id>
          <pub-id pub-id-type="medline">27219127</pub-id>
          <pub-id pub-id-type="pii">sdata201635</pub-id>
          <pub-id pub-id-type="pmcid">PMC4878278</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bojanowski</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Grave</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Joulin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Enriching Word Vectors with Subword Information</article-title>
          <source>TACL</source>
          <year>2017</year>
          <month>12</month>
          <volume>5</volume>
          <fpage>135</fpage>
          <lpage>146</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1162/tacl_a_00051"/>
          </comment>
          <pub-id pub-id-type="doi">10.1162/tacl_a_00051</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>BioWordVec, improving biomedical word embeddings with subword information and MeSH</article-title>
          <source>Sci Data</source>
          <year>2019</year>
          <month>05</month>
          <day>10</day>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>52</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41597-019-0055-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41597-019-0055-0</pub-id>
          <pub-id pub-id-type="medline">31076572</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41597-019-0055-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC6510737</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="web">
          <article-title>CDM_ETL-bacteria</article-title>
          <source>GitHub</source>
          <access-date>2021-01-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/KU-RIAS/CDM_ETL-bacteria">https://github.com/KU-RIAS/CDM_ETL-bacteria</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Stearns</surname>
              <given-names>MQ</given-names>
            </name>
            <name name-style="western">
              <surname>Price</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Spackman</surname>
              <given-names>KA</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>AY</given-names>
            </name>
          </person-group>
          <article-title>SNOMED clinical terms: overview of the development process and project status</article-title>
          <source>Proc AMIA Symp</source>
          <year>2001</year>
          <fpage>662</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/11825268"/>
          </comment>
          <pub-id pub-id-type="medline">11825268</pub-id>
          <pub-id pub-id-type="pii">D010001608</pub-id>
          <pub-id pub-id-type="pmcid">PMC2243297</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Yin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Xia</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Context-Sensitive Spelling Correction of Consumer-Generated Content on Health Care</article-title>
          <source>JMIR Med Inform</source>
          <year>2015</year>
          <month>07</month>
          <day>31</day>
          <volume>3</volume>
          <issue>3</issue>
          <fpage>e27</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2015/3/e27/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/medinform.4211</pub-id>
          <pub-id pub-id-type="medline">26232246</pub-id>
          <pub-id pub-id-type="pii">v3i3e27</pub-id>
          <pub-id pub-id-type="pmcid">PMC4705358</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Levenshtein</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Binary codes capable of correcting deletions, insertions and reversals</article-title>
          <source>Soviet Physics Doklady</source>
          <year>1966</year>
          <month>02</month>
          <volume>10</volume>
          <fpage>707</fpage>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Damerau</surname>
              <given-names>FJ</given-names>
            </name>
          </person-group>
          <article-title>A technique for computer detection and correction of spelling errors</article-title>
          <source>Commun. ACM</source>
          <year>1964</year>
          <month>03</month>
          <volume>7</volume>
          <issue>3</issue>
          <fpage>171</fpage>
          <lpage>176</lpage>
          <pub-id pub-id-type="doi">10.1145/363958.363994</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Crowell</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Ngo</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lacroix</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>A frequency-based technique to improve the spelling suggestion rank in medical queries</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2004</year>
          <month>06</month>
          <volume>11</volume>
          <issue>3</issue>
          <fpage>179</fpage>
          <lpage>185</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/14764616"/>
          </comment>
          <pub-id pub-id-type="doi">10.1197/jamia.M1474</pub-id>
          <pub-id pub-id-type="medline">14764616</pub-id>
          <pub-id pub-id-type="pii">M1474</pub-id>
          <pub-id pub-id-type="pmcid">PMC400516</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pande</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Effective search space reduction for spell correction using character neural embeddings</article-title>
          <source>Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers</source>
          <year>2017</year>
          <conf-name>15th Conference of the European Chapter of the Association for Computational Linguistics</conf-name>
          <conf-date>April 2017</conf-date>
          <conf-loc>Valencia, Spain</conf-loc>
          <fpage>170</fpage>
          <lpage>174</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/E17-2027.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/e17-2027</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="web">
          <article-title>How to Write a Spelling Corrector</article-title>
          <source>Norvig</source>
          <access-date>2021-02-16</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://norvig.com/spell-correct.html">http://norvig.com/spell-correct.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="web">
          <article-title>SymSpell</article-title>
          <source>GitHub</source>
          <access-date>2021-02-16</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/wolfgarbe/SymSpell">https://github.com/wolfgarbe/SymSpell</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <article-title>SCOWL (And Friends)</article-title>
          <source>Wordlist</source>
          <access-date>2021-02-16</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://wordlist.aspell.net/">http://wordlist.aspell.net/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nazir</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Qamar</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Zafar</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Shaheen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fatima</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Maqbool</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Automated misspelling detection and correction in clinical free-text records</article-title>
          <year>2018</year>
          <conf-name>2018 International Conference on Artificial Intelligence and Big Data (ICAIBD)</conf-name>
          <conf-date>May 26-28, 2018</conf-date>
          <conf-loc>Chengdu, China</conf-loc>
          <fpage>277</fpage>
          <lpage>280</lpage>
          <pub-id pub-id-type="doi">10.1109/icaibd.2018.8396209</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <collab>Dorland</collab>
          </person-group>
          <source>Dorland's Dictionary of Medical Acronyms and Abbreviations - 7th Edition</source>
          <year>2015</year>
          <publisher-loc>Amsterdam</publisher-loc>
          <publisher-name>Elsevier</publisher-name>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
