<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i10e28752</article-id>
      <article-id pub-id-type="pmid">34709197</article-id>
      <article-id pub-id-type="doi">10.2196/28752</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Privacy-Preserving Anonymity for Periodical Releases of Spontaneous Adverse Drug Event Reporting Data: Algorithm Development and Validation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Natsiavas</surname>
            <given-names>Pantelis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Jie-Teng</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1584-4911</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Lin</surname>
            <given-names>Wen-Yang</given-names>
          </name>
          <degrees>PhD, Prof Dr</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Computer Science and Information Engineering</institution>
            <institution>National University of Kaohsiung</institution>
            <addr-line>700 Kaohsiung Univ. Rd, Nanzih District</addr-line>
            <addr-line>Kaohsiung, 811</addr-line>
            <country>Taiwan</country>
            <phone>886 7 5919517</phone>
            <fax>886 7 5919514</fax>
            <email>wylin@nuk.edu.tw</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3462-7744</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Computer Science and Information Engineering</institution>
        <institution>National University of Kaohsiung</institution>
        <addr-line>Kaohsiung</addr-line>
        <country>Taiwan</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Wen-Yang Lin <email>wylin@nuk.edu.tw</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>10</month>
        <year>2021</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>28</day>
        <month>10</month>
        <year>2021</year>
      </pub-date>
      <volume>9</volume>
      <issue>10</issue>
      <elocation-id>e28752</elocation-id>
      <history>
        <date date-type="received">
          <day>13</day>
          <month>3</month>
          <year>2021</year>
        </date>
        <date date-type="rev-request">
          <day>25</day>
          <month>4</month>
          <year>2021</year>
        </date>
        <date date-type="rev-recd">
          <day>30</day>
          <month>7</month>
          <year>2021</year>
        </date>
        <date date-type="accepted">
          <day>2</day>
          <month>8</month>
          <year>2021</year>
        </date>
      </history>
      <copyright-statement>©Jie-Teng Wang, Wen-Yang Lin. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 28.10.2021.</copyright-statement>
      <copyright-year>2021</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2021/10/e28752" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Spontaneous reporting systems (SRSs) have been increasingly established to collect adverse drug events for fostering adverse drug reaction (ADR) detection and analysis research. SRS data contain personal information, and so their publication requires data anonymization to prevent the disclosure of individuals’ privacy. We have previously proposed a privacy model called MS(<italic>k</italic>, <italic>θ*</italic>)-bounding and the associated MS-Anonymization algorithm to fulfill the anonymization of SRS data. In the real world, the SRS data usually are released periodically (eg, FDA Adverse Event Reporting System [FAERS]) to accommodate newly collected adverse drug events. Different anonymized releases of SRS data available to the attacker may thwart our single-release-focus method, that is, MS(<italic>k</italic>, <italic>θ*</italic>)-bounding.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>We investigate the privacy threat caused by periodical releases of SRS data and propose anonymization methods to prevent the disclosure of personal privacy information while maintaining the utility of published data.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We identify potential attacks on periodical releases of SRS data, namely, BFL-attacks, mainly caused by follow-up cases. We present a new privacy model called PPMS(<italic>k</italic>, <italic>θ*</italic>)-bounding, and propose the associated PPMS-Anonymization algorithm and 2 improvements: PPMS+-Anonymization and PPMS++-Anonymization. Empirical evaluations were performed using 32 selected FAERS quarter data sets from 2004Q1 to 2011Q4. The performance of the proposed versions of PPMS-Anonymization was inspected against MS-Anonymization from some aspects, including data distortion, measured by normalized information loss; privacy risk of anonymized data, measured by dangerous identity ratio and dangerous sensitivity ratio; and data utility, measured by the bias of signal counting and strength (proportional reporting ratio).</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The best version of PPMS-Anonymization, PPMS++-Anonymization, achieves nearly the same quality as MS-Anonymization in both privacy protection and data utility. Overall, PPMS++-Anonymization ensures zero privacy risk on record and attribute linkage, and exhibits 51%-78% and 59%-82% improvements on information loss over PPMS+-Anonymization and PPMS-Anonymization, respectively, and significantly reduces the bias of ADR signal.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The proposed PPMS(<italic>k</italic>, <italic>θ*</italic>)-bounding model and PPMS-Anonymization algorithm are effective in anonymizing SRS data sets in the periodical data publishing scenario, preventing the series of releases from disclosing personal sensitive information caused by BFL-attacks while maintaining the data utility for ADR signal detection.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>adverse drug reaction</kwd>
        <kwd>data anonymization</kwd>
        <kwd>incremental data publishing</kwd>
        <kwd>privacy preserving data publishing</kwd>
        <kwd>spontaneous reporting system</kwd>
        <kwd>drug</kwd>
        <kwd>data set</kwd>
        <kwd>anonymous</kwd>
        <kwd>privacy</kwd>
        <kwd>security</kwd>
        <kwd>algorithm</kwd>
        <kwd>development</kwd>
        <kwd>validation</kwd>
        <kwd>data</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Motivation</title>
        <p>Adverse drug reactions (ADRs) are undesirable side effects of taking drugs. Before hitting the market, a new drug has to undergo a series of clinical trials. Unfortunately, it is hard to find all ADRs in the premarketing stage due to fewer volunteers. Thus, an increasing number of countries have built spontaneous reporting systems (SRSs) to collect adverse drug events (ADEs) to monitor the safety of marketed drugs, such as the FDA Adverse Event Reporting System (FAERS) of the US Food and Drug Administration (FDA) [<xref ref-type="bibr" rid="ref1">1</xref>], the UK Yellow Card scheme [<xref ref-type="bibr" rid="ref2">2</xref>], and the MedEffect Canada [<xref ref-type="bibr" rid="ref3">3</xref>]. Some countries even publish their SRS data sets, for example, US FDA and MedEffect Canada, to the public to facilitate ADR research.</p>
        <p>SRS data are a kind of microdata containing personal health information, such as diseases of the patients. Microdata, usually represented in the form of tables of tuples [<xref ref-type="bibr" rid="ref4">4</xref>], are composed of explicit identifier (<italic>ID</italic>) that can uniquely identify each individual (eg, SSN, name, phone number); quasi-identifier (<italic>QID</italic>) that can be linked with external data to reidentify some of the individuals (eg, sex, age, and ZIP code); sensitive attribute (<italic>SA</italic>) that contains sensitive information, such as disease or salary; and non-SA that falls into none of the above 3 categories. Publishing these data sets would lead to privacy threats. A real case did occur in Canada. A broadcaster successfully reidentified a 26-year-old girl by linking MedEffect Canada and the publicly available obituaries [<xref ref-type="bibr" rid="ref5">5</xref>]. This case motivated the research by El Emam et al [<xref ref-type="bibr" rid="ref5">5</xref>], whose findings showed that the MedEffect Canada data exhibit a high risk of identity disclosure.</p>
        <p>Generally, simple removal of the identification attributes, such as name, SSN, or phone, has been shown to fail to protect individual privacy [<xref ref-type="bibr" rid="ref6">6</xref>]. The adversary can still link published data to external data (eg, voter list) through quasi-identification attributes, such as gender, job, age, and ZIP code. This calls for the research topic, namely, privacy-preserving data publishing (PPDP), which aims to anonymize raw data before publication. In [<xref ref-type="bibr" rid="ref7">7</xref>], we pointed out that none of the traditional anonymization methods (eg, <italic>k</italic>-anonymity [<xref ref-type="bibr" rid="ref6">6</xref>], <italic>l</italic>-diversity [<xref ref-type="bibr" rid="ref8">8</xref>]) is favorable for SRS data sets due to characteristics such as multiple individual records, multivalued SAs, and rare events. Later, we proposed a privacy model called MS(<italic>k</italic>, <italic>θ*</italic>)-bounding [<xref ref-type="bibr" rid="ref9">9</xref>] to anonymize SRS data to prevent the disclosure of individual privacy. New events arrive in SRSs continuously in the real world, so countries such as the USA and Canada release SRS data sets periodically, for example, every quarter, to handle this kind of dynamically growing data set (ie, periodical data publishing). Unfortunately, MS(<italic>k</italic>, <italic>θ*</italic>)-bounding is designed for a single static publishing scenario and is ill-suited to handling a series of published data sets.</p>
        <p>Usually, each ADE record in SRS data contains a CaseID to trace the follow-ups of that event; all records with the same CaseID, located within the same or different periods, refer to the same event. Although one may regard follow-ups as duplicates of the original case, the situation is somewhat different. Follow-up cases contain complements or corrections of the original case. By contrast, duplicate reports refer to the same case submitted by different reporters and thus misrecorded with different CaseIDs. Follow-ups are easily detected via CaseID, but identifying actual duplicates is challenging, which should be considered a data preprocessing issue. There have been some research studies on detecting actual duplicates in SRS data [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. Most SRS systems such as FAERS, however, provide no deduplication mechanism. We thus ignore this issue. Unfortunately, CaseID provides a useful linkage for the adversary across a series of anonymized data sets to exclude records not belonging to the target, raising the risk of breaching the target’s privacy. For illustration, let us consider 3 consecutive quarters of published SRS data sets in <xref ref-type="table" rid="table1">Table 1</xref>, each of which satisfies 3-anonymity.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Three consecutive quarters of published spontaneous reporting system data sets, each satisfying 3-anonymity.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="260"/>
            <col width="210"/>
            <col width="210"/>
            <col width="290"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Quarter and CaseID</td>
                <td>Sex</td>
                <td>Age</td>
                <td>Disease</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">
                  <bold>1</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>1</td>
                <td>Male</td>
                <td>[35-40]</td>
                <td>Flu</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>2</td>
                <td>Male</td>
                <td>[35-40]</td>
                <td>Flu</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>3</td>
                <td>Male</td>
                <td>[35-40]</td>
                <td>Fever</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>4</td>
                <td>Female</td>
                <td>[30-35]</td>
                <td>HIV</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>5</td>
                <td>Female</td>
                <td>[30-35]</td>
                <td>Flu</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>6</td>
                <td>Female</td>
                <td>[30-35]</td>
                <td>Diabetes</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>2</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>1</td>
                <td>ANY</td>
                <td>[30-40]</td>
                <td>Flu</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>4</td>
                <td>ANY</td>
                <td>[30-40]</td>
                <td>HIV</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>7</td>
                <td>ANY</td>
                <td>[30-40]</td>
                <td>Diabetes</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>8</td>
                <td>Male</td>
                <td>[30-35]</td>
                <td>Fever</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>9</td>
                <td>Male</td>
                <td>[30-35]</td>
                <td>Flu</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>10</td>
                <td>Male</td>
                <td>[30-35]</td>
                <td>Diabetes</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>11</td>
                <td>Male</td>
                <td>[30-35]</td>
                <td>HIV</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>12</td>
                <td>Male</td>
                <td>[30-35]</td>
                <td>Flu</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>3</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>13</td>
                <td>Female</td>
                <td>[30-35]</td>
                <td>Flu</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>14</td>
                <td>Female</td>
                <td>[30-35]</td>
                <td>Diabetes</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>15</td>
                <td>Female</td>
                <td>[30-35]</td>
                <td>Fever</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>16</td>
                <td>Female</td>
                <td>[30-35]</td>
                <td>Flu</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>17</td>
                <td>Female</td>
                <td>[30-35]</td>
                <td>Fever</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>7</td>
                <td>Male</td>
                <td>[30-35]</td>
                <td>Diabetes</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>8</td>
                <td>Male</td>
                <td>[30-35]</td>
                <td>Fever</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>18</td>
                <td>Male</td>
                <td>[30-35]</td>
                <td>HIV</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Possible Scenarios</title>
        <sec>
          <title>Scenario I</title>
          <p>Suppose that the adversary learns that his/her neighbor Alice, whose <italic>QID</italic> value is {Female, 32}, suffered from some ADR in Q2. First, the adversary links to <xref ref-type="table" rid="table1">Table 1</xref> (quarter 2) through the <italic>QID</italic> of Alice, learning that the record of Alice is in the first <italic>QID</italic> group (CaseIDs 1, 4, and 7). The adversary can then link to the previously published SRS data through the candidate CaseID set {1, 4, 7} and find the record with CaseID=1 and Sex=Male in <xref ref-type="table" rid="table1">Table 1</xref> (quarter 1). Because Alice is female, the adversary can exclude CaseID 1 from the candidate CaseID set {1, 4, 7}, changing <xref ref-type="table" rid="table1">Table 1</xref> (quarter 2) to 2-anonymous and lifting the confidence of the attacker to identify Alice.</p>
        </sec>
        <sec>
          <title>Scenario II</title>
          <p>Following the previous example, the adversary has known the candidate CaseID set of Alice {4, 7}. The adversary can now use this set to link to subsequently published SRS data and observe a record whose CaseID is 7 in <xref ref-type="table" rid="table1">Table 1</xref> (quarter 3). Because the owner of that record is male, the adversary can exclude CaseID 7 from the candidate CaseID set, concluding that the CaseID of Alice in <xref ref-type="table" rid="table1">Table 1</xref> (quarter 2) is 4.</p>
        </sec>
        <sec>
          <title>Scenario III</title>
          <p>Suppose that the adversary learns John’s <italic>QID</italic> value is {Male, 33} and the first time that John had an ADR is in Q3. This means that the CaseID of John’s event is a “new CaseID” in Q3 and shall not appear in any previously released data. First, the adversary links to Quarter 3 and learns that the record of John is within the second <italic>QID</italic> group (CaseIDs 7, 8, 18). The adversary can then connect to the 2 previously published SRS data sets through the candidate CaseID set of John {7, 8, 18}, observing 2 matching records whose CaseIDs are 7 and 8 in Quarter 2. The CaseID of John is neither 7 nor 8, so the adversary concludes that the CaseID of John is 18, ruining the privacy protection embedded by 3-anonymity.</p>
        </sec>
      </sec>
      <sec>
        <title>Background Knowledge and Related Work</title>
        <sec>
          <title>Privacy Models for Microdata Publishing</title>
          <p>Research on PPDP [<xref ref-type="bibr" rid="ref4">4</xref>] aims to protect released microdata from 2 types of privacy attacks: <italic>record disclosure</italic> and <italic>attribute disclosure</italic>.</p>
          <p>Record disclosure, also known as <italic>table linkage attack</italic>, refers to the situation in which the individual identity of a specific tuple that has been deidentified in the published data is reidentified. Although it is hard to prevent table linkage attacks, it is possible to reduce the possibility of identifying victims in published data. A notable achievement is the invention of <italic>k</italic>-anonymity [<xref ref-type="bibr" rid="ref6">6</xref>], which is the most influential privacy model that generalizes the values of <italic>QID</italic> to ensure that each record in published data is accompanied by at least <italic>k</italic>–1 other records with the same <italic>QID</italic> value.</p>
          <p>Attribute disclosure, also known as <italic>attribute linkage attack</italic>, refers to the situation in which attackers can infer an individual’s sensitive information, even though they fail to perceive the exact record of the victim. Unfortunately, <italic>k</italic>-anonymity is not able to prevent attribute disclosure. Another renowned privacy model called <italic>l</italic>-diversity [<xref ref-type="bibr" rid="ref8">8</xref>] was thus proposed. The main idea of <italic>l</italic>-diversity is to thwart the adversary’s belief on the probability of the sensitive value by ensuring that each <italic>QID</italic> group contains at least <italic>l</italic> “well-represented” sensitive values, that is, the probability of inferring the sensitive value of the victim will be at most 1/<italic>l</italic>.</p>
        </sec>
        <sec>
          <title>Privacy Models for Incremental Data Publishing</title>
          <p>Most real-world data are not static but dynamically changing, which means that data cannot be published simultaneously but have to be published incrementally [<xref ref-type="bibr" rid="ref4">4</xref>]. Previously proposed privacy models such as <italic>k</italic>-anonymity and <italic>l</italic>-diversity only focus on single static data publishing and are ill-suited to preventing privacy disclosure in incremental data publishing. Contemporary privacy models for incremental data publishing can be classified into <italic>continuous</italic> or <italic>dynamic</italic> data publishing [<xref ref-type="bibr" rid="ref4">4</xref>].</p>
        </sec>
        <sec>
          <title>Continuous Data Publishing</title>
          <p>This refers to the scenario in which all data collected so far have to be published even if some of the data have been released before. More precisely, suppose that the data holder had previously collected a set of records <italic>D</italic><sub>1</sub> time stamped <italic>t</italic><sub>1</sub> and published the anonymized version <italic>R</italic><sub>1</sub> of <italic>D</italic><sub>1</sub>. After collecting a new set of records <italic>D</italic><sub>2</sub> time stamped <italic>t</italic><sub>2</sub>, the data holder will publish <italic>R</italic><sub>2</sub> as an anonymized version of all records collected so far (ie, <italic>D</italic><sub>1</sub> ∪ <italic>D</italic><sub>2</sub>). In general, the published release <italic>R<sub>i</sub></italic> (<italic>i</italic>≥1) shall be an anonymized version of <italic>D</italic><sub>1</sub> ∪ <italic>D</italic><sub>2</sub> ∪ ... ∪ <italic>D<sub>i</sub></italic>.</p>
          <p>Byun et al [<xref ref-type="bibr" rid="ref13">13</xref>] first identified the privacy threat under continuous data publishing. They demonstrated possible inference channels by comparing different <italic>l</italic>-diverse releases to explore the sensitive values of victims. They later enhanced their approach by considering both <italic>k</italic>-anonymity and <italic>l</italic>-diversity, called (<italic>k</italic>, <italic>c</italic>)-anonymous, and exploring more types of adversarial attacks named <italic>cross-version inference</italic>s [<xref ref-type="bibr" rid="ref14">14</xref>].</p>
          <p>Pei et al [<xref ref-type="bibr" rid="ref15">15</xref>] illustrated that in the continuous data publishing scenario, the adversary can infer some privacy information from multiple releases that have been sanitized by <italic>k</italic>-anonymity. They also proposed an effective method called “monotonic incremental anonymization,” which would progressively and consistently reduce the generalization granularity as the updates arrive to maintain <italic>k</italic>-anonymity.</p>
          <p>Fung et al [<xref ref-type="bibr" rid="ref16">16</xref>] proposed a method to quantify the exact number of records that can be “cracked” by comparing the series of published <italic>k</italic>-anonymous data. The adversary can exclude the cracked records from published data, making the published data no longer satisfy <italic>k</italic>-anonymity. They also presented a privacy model, called <italic>BCF-</italic>anonymity, to measure the anonymous number in published data after excluding the cracked records, and proposed an algorithm to anonymize published data achieving <italic>BCF</italic>-anonymity.</p>
        </sec>
        <sec>
          <title>Dynamic Data Publishing</title>
          <p>This refers to the scenario in which the data holder can insert records into or delete records, or perform both actions, from raw data sets. Suppose that the data holder had collected an initial set of records <italic>D</italic><sub>1</sub> in time <italic>t</italic><sub>1</sub> and published its anonymized version <italic>R</italic><sub>1</sub>. During the period [<italic>t</italic><sub>1</sub>, <italic>t</italic><sub>2</sub>), the data holder kept collecting new records and inserted them into <italic>D</italic><sub>1</sub>. Further, the data holder might delete and update some records from <italic>D</italic><sub>1</sub>, finally obtaining the updated version <italic>D</italic><sub>2</sub> of <italic>D</italic><sub>1</sub> in <italic>t</italic><sub>2</sub>. Then, the published release <italic>R</italic><sub>2</sub> in <italic>t</italic><sub>2</sub> is an anonymized version of <italic>D</italic><sub>2</sub>. In general, a published release <italic>R<sub>i</sub></italic> in time <italic>t<sub>i</sub></italic> shall be an anonymized version of <italic>D<sub>i</sub></italic>.</p>
          <p>Xiao and Tao [<xref ref-type="bibr" rid="ref17">17</xref>] identified a kind of privacy disclosure called <italic>critical absence</italic>. The adversary can infer victims’ sensitive information by comparing the series of published <italic>l</italic>-diverse data in dynamic data publishing scenarios (only considered insertion and deletion). They proposed a privacy model, called <italic>m</italic>-invariance, to ensure the certain “invariance” of the “signature” of <italic>QID</italic> groups, and an effective method called counterfeited generalization to anonymize published data achieving <italic>m</italic>-invariance.</p>
          <p>Bu et al [<xref ref-type="bibr" rid="ref18">18</xref>] noticed that some sensitive values would be permanent, such as criminal record and some incurable diseases, such as HIV. They showed that <italic>m</italic>-invariance is unable to prevent privacy disclosure when permanent sensitive values are present. Therefore, they proposed an anonymization approach, called <italic>HD-</italic>composition [<xref ref-type="bibr" rid="ref18">18</xref>], to limit the probability of linkage between individuals and sensitive values not over a given threshold.</p>
          <p>On observing that <italic>m</italic>-invariance only considers data evolution caused by insertion and deletion, Li and Zhou [<xref ref-type="bibr" rid="ref19">19</xref>] further presented a counterfeit generalization model named <italic>m</italic>-distinct to support full data evolution (ie, insertion, update, and deletion). Moreover, they observed that attribute updates are seldom arbitrary, with some correlations often existing between the old and the new values. Based on this observation, they assumed that all updates on sensitive values are nonarbitrary. Therefore, <italic>m</italic>-distinct applies the concept of the candidate update set, which is a set of specific sensitive values that can be updated.</p>
          <p>Following the work in [<xref ref-type="bibr" rid="ref19">19</xref>], Anjum et al [<xref ref-type="bibr" rid="ref20">20</xref>] further assumed that the updates in fully dynamic data publishing are arbitrary, meaning the old values of attributes may not correlate with the new values. They presented a new kind of attack named τ-attack by exploiting the “event list” of an individual. They also proposed a method called τ-safety, an extension of <italic>m</italic>-invariance, to solve the privacy disclosure caused by τ-attack.</p>
          <p>He et al [<xref ref-type="bibr" rid="ref21">21</xref>] presented a new type of attack named <italic>value equivalence attack</italic>, which can exploit the partitioned structure of published data, such as <italic>m</italic>-invariant releases, to obtain sensitive information of individuals. Once the adversary knows the actual sensitive value of an individual, he/she can disclose the sensitive information of the remaining individuals within the same equivalence class. They proposed a graph-based anonymization algorithm, which leverages a min-cut algorithm to prevent the old “value association attack” and the new “equivalence attack.”</p>
          <p>Specifically, Bewong et al [<xref ref-type="bibr" rid="ref22">22</xref>] focused on transactional data. They proposed a new privacy model called <italic>serially preserving</italic>, which requires the posterior probability of any sensitive term to its corresponding population rate bounded by a given threshold. A novel anonymization method (Sanony, which counts on adding counterfeits) was presented to guarantee a new published transactional data set satisfying the required privacy model.</p>
          <p>There is another scenario of nonstatic data publishing called <italic>sequential data publishing</italic>. Different vertical projections of the same table on different subsets of attributes are published consecutively in this scenario. Anonymization models and methods for this scenario were first studied in [<xref ref-type="bibr" rid="ref23">23</xref>] and then further investigated in [<xref ref-type="bibr" rid="ref24">24</xref>] and [<xref ref-type="bibr" rid="ref25">25</xref>].</p>
          <p>In summary, no contemporary work addresses the scenario of periodical data publishing, and no work on SRS data anonymization has considered the privacy threat caused by follow-up cases. In this paper, we investigate the privacy threat caused by periodical releases of SRS data and propose anonymization methods to prevent the disclosure of personal privacy information while maintaining the utility of published data.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Publishing Scenario and Privacy Attacks</title>
        <p>We first introduce the periodical data publishing scenario and present 3 kinds of privacy attacks for periodically published SRS data sets satisfying MS(<italic>k</italic>, <italic>θ</italic>*)-bounding. We propose a new privacy model, PPMS(<italic>k</italic>, <italic>θ</italic>*)-bounding, to protect published SRS data sets from those attacks in the periodical data publishing scenario. We also propose a corresponding anonymization algorithm, namely PPMS-anonymization, that incorporates 2 innovative strategies, <italic>NC</italic>-bounding and <italic>QID</italic>-covering, to prevent the released data sets from privacy attacks caused by follow-up key (ie, CaseID). Two extensions of PPMS-anonymization, PPMS+-anonymization and PPMS++-anonymization, are presented as well, which employ more efficient techniques, including neglecting subsequent coverings and grouping with new cases.</p>
      </sec>
      <sec>
        <title>BFL-Attacks</title>
        <p>Typical SRS data, such as FAERS, are usually published periodically and contain follow-up cases, which can be expressed as a new data publishing model named periodical data publishing. Suppose that the data holder previously had collected an initial set of records <italic>D</italic><sub>1</sub> in period [<italic>t</italic><sub>0</sub>, <italic>t</italic><sub>1</sub>) and published <italic>R</italic><sub>1</sub> as an anonymized version of <italic>D</italic><sub>1</sub>. After collecting a new set of records <italic>D</italic><sub>2</sub> during period [<italic>t</italic><sub>1</sub>, <italic>t</italic><sub>2</sub>), the data holder wants to anonymize and publish <italic>D</italic><sub>2</sub> at time <italic>t</italic><sub>2</sub>. <italic>D</italic><sub>2</sub> may or may not contain some follow-up cases in <italic>D</italic><sub>1</sub>. Let <italic>R</italic><sub>2</sub> denote the anonymized version of <italic>D</italic><sub>2</sub>. In general, the release <italic>R<sub>i</sub></italic> published at <italic>t<sub>i</sub></italic> is an anonymized version of <italic>D<sub>i</sub></italic> (<italic>i</italic>≥1). Note that for an original case <italic>x</italic>, the life span of its follow-up cases in subsequent releases is not continuous. That is, a follow-up observed in <italic>D<sub>i</sub></italic> may disappear in <italic>D<sub>i</sub></italic><sub>+1</sub> but show up again in some later release <italic>D<sub>i</sub></italic><sub>+</sub><italic><sub>j</sub></italic>, for <italic>j</italic>&#62;1. This makes the periodical publishing scenario distinct from existing scenarios in the literature. First, unlike the situation in dynamic data publishing, <italic>D<sub>i</sub></italic> is a new set of collections, rather than updated from <italic>D<sub>i</sub></italic><sub>–1</sub>. 
Besides, the existence of follow-up cases is different from the assumption for continuous data publishing (ie, all cases in <italic>D<sub>i</sub></italic> should be kept in all subsequent releases <italic>D<sub>j</sub></italic>, for <italic>j&#62;i</italic>)<italic>.</italic> A comparison of the proposed periodical data publishing with dynamic data publishing and sequential data publishing is summarized in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> (also see <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>).</p>
        <boxed-text id="box1" position="float">
          <title>Definition 1: QID-cover.</title>
          <p>Consider the <italic>QID</italic> values, <italic>q</italic><sub>1</sub> and <italic>q</italic><sub>2</sub>, of 2 cases. We say <italic>q</italic><sub>1</sub> covers <italic>q</italic><sub>2</sub>, denoted by <italic>q</italic><sub>1</sub> <inline-graphic xlink:href="medinform_v9i10e28752_fig14.png" xlink:type="simple" mimetype="image"/> <italic>q</italic><sub>2</sub>, if for every attribute <italic>a</italic> in <italic>QID</italic>, <italic>a</italic>(<italic>q</italic><sub>1</sub>) is equal to or more generalized than <italic>a</italic>(<italic>q</italic><sub>2</sub>), where <italic>a</italic>(<italic>q</italic>) denotes the value of <italic>q</italic> in attribute <italic>a</italic>.</p>
        </boxed-text>
        <sec>
          <title>Backward-Attack (B-Attack)</title>
          <p>Backward-Attack (<italic>B</italic>-attack) focuses on excluding records from the specific release by exploiting some previous ones (<xref ref-type="boxed-text" rid="box2">Textbox 2</xref>). Scenario I is an example, which occurs when the <italic>QID</italic> value of the old case differs from the background learned by the attacker. As the <italic>QID</italic> values would have been generalized in all published releases, the only way by which <italic>B</italic>-attack can succeed is when the <italic>QID</italic> value of old CaseID fails to cover that of the current CaseID. More precisely, for every target <italic>v</italic>, if in any previous release there exists an old CaseID <italic>i</italic><sub>old</sub> corresponding to the candidate CaseID set of <italic>v</italic> such that the <italic>QID</italic> value of <italic>i</italic><sub>old</sub> does not cover the <italic>QID</italic> value of <italic>v</italic>, then <italic>i</italic><sub>old</sub> would be excluded from the candidate CaseID set of <italic>v</italic>.</p>
          <boxed-text id="box2" position="float">
            <title>Definition 2: Backward-attack.</title>
            <p>Consider a target <italic>v</italic> to be inferred by the attacker and an anonymized release <italic>R<sub>i</sub></italic>. Let <italic>q<sup>v</sup></italic> and <italic>CI</italic> denote the <italic>QID</italic> value and the candidate CaseID set of <italic>v</italic> in <italic>R<sub>i</sub></italic>, respectively, and <italic>U</italic> be the set of records in all previous releases {<italic>R</italic><sub>1</sub>, <italic>R</italic><sub>2</sub>, ..., <italic>R<sub>i</sub></italic><sub>–1</sub>} whose CaseID is in <italic>CI</italic>. The <italic>B</italic>-attack will occur if there exists a record <italic>r</italic> in <italic>U</italic> such that the <italic>QID</italic> value of <italic>r</italic>, <italic>q<sup>r</sup></italic>, does not cover <italic>q<sup>v</sup></italic>. The set of these excludable records is denoted by <italic>B</italic>.</p>
          </boxed-text>
        </sec>
        <sec>
          <title>Forward-Attack (F-Attack)</title>
          <p>Analogous to <italic>B</italic>-attack, Forward-Attack (<italic>F</italic>-attack) occurs when the <italic>QID</italic> value of the following CaseID differs from the background learned by the attacker (<xref ref-type="boxed-text" rid="box3">Textbox 3</xref>). That is, the <italic>QID</italic> value of a following CaseID in some subsequent releases fails to cover that of the current CaseID. An example is shown in Scenario II. More precisely, for every target <italic>v</italic>, if in any subsequent release there exists a following CaseID <italic>i</italic><sub>new</sub> corresponding to the candidate CaseID set of <italic>v</italic> such that the <italic>QID</italic> value of <italic>i</italic><sub>new</sub> does not cover the <italic>QID</italic> value of <italic>v</italic>, then <italic>i</italic><sub>new</sub> would be excluded from the candidate CaseID set of <italic>v</italic>.</p>
          <boxed-text id="box3" position="float">
            <title>Definition 3: Forward-attack.</title>
            <p>Consider a target <italic>v</italic> and an anonymized release <italic>R<sub>i</sub></italic>. Let <italic>q<sup>v</sup></italic> and <italic>CI</italic> denote the <italic>QID</italic> value and the candidate CaseID set of <italic>v</italic> in <italic>R<sub>i</sub></italic>, respectively, and <italic>U</italic> be the set of records in all subsequent releases {<italic>R<sub>i</sub></italic><sub>+1</sub>, <italic>R<sub>i</sub></italic><sub>+2</sub>, ..., <italic>R<sub>c</sub></italic>} whose CaseID is in <italic>CI</italic>. The <italic>F</italic>-attack will occur if there exists a record <italic>r</italic> in <italic>U</italic> such that the <italic>QID</italic> value of <italic>r</italic>, <italic>q<sup>r</sup></italic>, does not cover <italic>q<sup>v</sup></italic>. The set of these excludable records is denoted by <italic>F</italic>.</p>
          </boxed-text>
        </sec>
        <sec>
          <title>Latest-Attack (L-Attack)</title>
          <p>This attack is illustrated in Scenario III. In this example, the attacker knows that the event for the target (John) first appears in Quarter 3. It follows that John’s case (CaseID) is definitely absent in all previously published releases. In general, for every target <italic>v</italic> whose CaseID is first present in some release known by the attacker, <italic>Latest Attack</italic> (<italic>L</italic>-attack) would occur if the candidate CaseID set of <italic>v</italic> contains some old CaseIDs appearing in previous releases (<xref ref-type="boxed-text" rid="box4">Textbox 4</xref>).</p>
          <boxed-text id="box4" position="float">
            <title>Definition 4: Latest-attack.</title>
            <p>Consider a target <italic>v</italic>. Suppose the attacker learns that the CaseID of <italic>v</italic> first appears in an anonymized release <italic>R<sub>i</sub></italic>. Let <italic>CI</italic> be the candidate CaseID set of <italic>v</italic> in <italic>R<sub>i</sub></italic>. The <italic>L</italic>-attack will occur if there exists any case in <italic>CI</italic> whose CaseID appears in some previous releases. The set of these excludable records is denoted by <italic>L</italic>.</p>
          </boxed-text>
        </sec>
      </sec>
      <sec>
        <title>Privacy Model PPMS(k, θ*)-bounding</title>
        <p>To prevent <italic>BFL</italic>-attacks, we propose a new privacy model called periodical-publishing multisensitive (<italic>k</italic>, <italic>θ*</italic>)-bounding, abbreviated as PPMS(<italic>k</italic>, <italic>θ*</italic>)-bounding (<xref ref-type="boxed-text" rid="box5">Textboxes 5</xref> and <xref ref-type="boxed-text" rid="box6">6</xref>).</p>
        <boxed-text id="box5" position="float">
          <title>Definition 5: Confidence.</title>
          <p>Let <italic>s</italic> be a sensitive value in <italic>SA</italic> and <italic>R<sub>i</sub></italic> be an anonymized release. Given a target <italic>v</italic> with <italic>QID</italic> value <italic>q<sup>v</sup></italic>, we define the probability that <italic>v</italic> has sensitive value <italic>s</italic> as <italic>conf</italic>(<italic>v</italic> → <italic>s</italic>), which is equal to <italic>σ<sub>s</sub></italic>(<italic>g</italic>)/&#124;<italic>g</italic>&#124;, where <italic>g</italic> denotes the <italic>QID</italic> group in <italic>R<sub>i</sub></italic> in which <italic>v</italic> resides and <italic>σ<sub>s</sub></italic>(<italic>g</italic>) is the number of cases in <italic>g</italic> that contain <italic>s</italic>.</p>
        </boxed-text>
        <boxed-text id="box6" position="float">
          <title>Definition 6: PPMS(<italic>k, θ</italic>*)-bounding.</title>
          <p>Let <italic>S</italic>={<italic>s</italic><sub>1</sub>, <italic>s</italic><sub>2</sub>, ..., <italic>s<sub>m</sub></italic>} be the set of all possible sensitive values in <italic>SA</italic> and <italic>θ*</italic>=(<italic>θ</italic><sub>1</sub>, <italic>θ</italic><sub>2</sub>, ..., <italic>θ<sub>m</sub></italic>) be the probability thresholds specified by the data holder, where 0≤<italic>θ<sub>j</sub></italic>≤1, for 1≤<italic>j</italic>≤<italic>m</italic>. We say a series of anonymized releases <italic>R</italic><sub>1</sub>, <italic>R</italic><sub>2</sub>, ..., <italic>R<sub>n</sub></italic> satisfies PPMS(<italic>k</italic>, <italic>θ*</italic>)-bounding if each <italic>R<sub>i</sub></italic>, 1 ≤ <italic>i</italic> ≤ <italic>n</italic>, satisfies the following:</p>
          <p>1. For every individual <italic>v</italic>, the size of the candidate CaseID set <italic>CI</italic> of <italic>v</italic> in <italic>R<sub>i</sub></italic> excluding <italic>B</italic>, <italic>F</italic>, and <italic>L</italic> is no less than <italic>k</italic>, that is, &#124;<italic>CI</italic> – (<italic>B</italic>∪<italic>F</italic>∪<italic>L</italic>)&#124; ≥ <italic>k</italic>, and</p>
          <p>2. The confidence to infer <italic>v</italic> having any sensitive value <italic>s<sub>j</sub></italic> ∈ <italic>S</italic> is no larger than <italic>θ<sub>j</sub></italic>, that is, <italic>conf</italic>(<italic>v</italic> → <italic>s<sub>j</sub></italic>) ≤ <italic>θ<sub>j</sub></italic>.</p>
          <p>The privacy requirement of Definition 6(1) is to prevent record disclosure while Definition 6(2) is to prevent attribute disclosure. Our model adopts nonuniform thresholds for different sensitive values because different values express different degrees of sensitivity in the real world. For example, the disclosure of a patient with fever is far less sensitive than that of an individual with HIV.</p>
        </boxed-text>
      </sec>
      <sec>
        <title>Anonymization Algorithm</title>
        <sec>
          <title>Overview</title>
          <p>Our algorithm can be summarized as a greedy and clustering approach to divide records into <italic>QID</italic> groups. Viewing each <italic>QID</italic> group as a cluster, we adopted a clustering-based method [<xref ref-type="bibr" rid="ref26">26</xref>] to build <italic>QID</italic> groups, each of which starts from a randomly chosen record and grows gradually by adding a solo record exhibiting the best characteristic among all candidates. This process repeats until the <italic>QID</italic> group satisfies the “<italic>k</italic>” requirement. Finally, we generalize the <italic>QID</italic> values of all records within the same cluster to the same value.</p>
          <p>We adopted 2 metrics, information loss [<xref ref-type="bibr" rid="ref26">26</xref>] (<xref ref-type="boxed-text" rid="box7">Textbox 7</xref>) and privacy risk (PR) [<xref ref-type="bibr" rid="ref9">9</xref>] (<xref ref-type="boxed-text" rid="box8">Textbox 8</xref>), to choose the best isolated record. For each evolving <italic>QID</italic> group, the former favors the new record contributing minimal impact to the data utility while the latter quantifies the ratio of sensitive values within the <italic>QID</italic> group to meet the privacy requirement in Definition 6(2).</p>
          <boxed-text id="box7" position="float">
            <title>Definition 7: Information loss.</title>
            <p>Suppose the <italic>QID</italic> attributes can be separated to 2 different sets, numerical attributes {<italic>N</italic><sub>1</sub>, <italic>N</italic><sub>2</sub>, ..., <italic>N<sub>m</sub></italic>} and categorical attributes {<italic>C</italic><sub>1</sub>, <italic>C</italic><sub>2</sub>, ..., <italic>C<sub>n</sub></italic>}, and each <italic>C<sub>i</sub></italic> is associated with a taxonomy tree <italic>T<sub>i</sub></italic>. Let <italic>g</italic> denote a <italic>QID</italic> group (or cluster). The <italic>information loss</italic> (<italic>IL</italic>) [<xref ref-type="bibr" rid="ref26">26</xref>] of <italic>g</italic> is defined as follows:</p>
            <disp-formula>
              <graphic xlink:href="medinform_v9i10e28752_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
            <p>where max(<italic>N<sub>i</sub></italic>) and min(<italic>N<sub>i</sub></italic>) denote the maximum and minimum values of attribute <italic>N<sub>i</sub></italic> in the whole data set, and max(<italic>N<sub>i</sub></italic>, <italic>g</italic>) and min(<italic>N<sub>i</sub></italic>, <italic>g</italic>) denote the maximum and minimum values of attribute <italic>N<sub>i</sub></italic> in <italic>g</italic>. Notation &#124;<italic>g</italic>&#124; is the number of records in <italic>g</italic>, <italic>h</italic>(<italic>C<sub>j</sub></italic>) the height of the taxonomy tree <italic>T<sub>j</sub></italic>, and <italic>h</italic>(<italic>C<sub>j</sub></italic>, <italic>g</italic>) is the height of the generalized value of <italic>C<sub>j</sub></italic> in <italic>g</italic> in taxonomy tree <italic>T<sub>j</sub></italic>.</p>
            <p>To find a new record <italic>r</italic> to be included in <italic>g</italic>, we choose the one causing the least increase of information loss, which is measured by</p>
            <disp-formula>Δ<italic>IL</italic>(<italic>g</italic>, <italic>r</italic>)=<italic>IL</italic>(<italic>g</italic> ∪ {<italic>r</italic>}) – <italic>IL</italic>(<italic>g</italic>) <bold>(2)</bold></disp-formula>
            <p>Then, the most feasible choice <italic>r<sub>bst</sub></italic> is</p>
            <disp-formula><italic>r<sub>bst</sub></italic>=argmin<italic><sub>r</sub></italic> Δ<italic>IL</italic>(<italic>g</italic>, <italic>r</italic>) <bold>(3)</bold></disp-formula>
            <p>In addition, the inclusion of record <italic>r</italic> containing sensitive value <italic>s</italic> that appears in <italic>g</italic> would cause the ratio of <italic>s</italic> in <italic>g</italic> to be over <italic>θ<sub>s</sub></italic>. As we will derive in Lemma 2, we have to keep the occurrence of <italic>s</italic> in <italic>g</italic>, denoted by <italic>σ<sub>s</sub></italic>(<italic>g</italic>), under a maximum threshold<italic> η<sub>s</sub></italic>(<italic>g</italic>) to prevent the confidence of inferring sensitive value <italic>s</italic> in <italic>g</italic> from being larger than <italic>θ<sub>s</sub></italic>. We thus adopt the <italic>PR<sub>s</sub></italic> introduced in [<xref ref-type="bibr" rid="ref9">9</xref>].</p>
            <disp-formula>
              <graphic xlink:href="medinform_v9i10e28752_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
            <p>When <italic>η<sub>s</sub></italic>(<italic>g</italic>∪{<italic>r</italic>}) ≥ <italic>σ<sub>s</sub></italic>(<italic>g</italic>∪{<italic>r</italic>}), a greater <italic>σ<sub>s</sub></italic> leads to a larger <italic>PR</italic><sub>s</sub>. Therefore, Equation 4 favors the new record <italic>r</italic> whose sensitive values are relatively rare in <italic>g</italic>. Because a record may contain more than 1 sensitive value, the PR caused by adding <italic>r</italic> into <italic>g</italic> can be defined as the summation of <italic>PR<sub>s</sub></italic> over all sensitive values.</p>
          </boxed-text>
          <boxed-text id="box8" position="float">
            <title>Definition 8: Privacy risk.</title>
            <p>Let <italic>g</italic> denote a <italic>QID</italic> group (or cluster) during the execution of our anonymization algorithm. The PR [<xref ref-type="bibr" rid="ref9">9</xref>] of adding a new record <italic>r</italic> into <italic>g</italic> is</p>
            <disp-formula>
              <graphic xlink:href="medinform_v9i10e28752_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
            <p>where <italic>s</italic> ∈ <italic>S<sub>r</sub></italic> and <italic>S<sub>r</sub></italic> is the set of sensitive values contained in record <italic>r</italic>.</p>
            <p>The value of summation of <italic>PR<sub>s</sub></italic> may be zero, that is, all sensitive values in <italic>r</italic> are new to group <italic>g</italic>. An increment is thus added into PR(<italic>g</italic>, <italic>r</italic>) in Equation 5 to avoid zero PR. The smaller the PR caused by adding <italic>r</italic> into <italic>g</italic>, the more likely <italic>r</italic> will be chosen. If the inclusion of <italic>r</italic> makes the number of records containing <italic>s</italic> in <italic>g</italic> more than the maximally allowed number, PR becomes infinite, so <italic>r</italic> will not be chosen. Finally, we refine Δ<italic>IL</italic> into Δ<italic>IL'</italic> as follows</p>
            <disp-formula>Δ<italic>IL</italic>ʹ(<italic>g</italic>, <italic>r</italic>)=Δ<italic>IL</italic>(<italic>g</italic>, <italic>r</italic>) × PR(<italic>g</italic>, <italic>r</italic>) <bold>(6)</bold></disp-formula>
            <p>and the most feasible choice <italic>r<sub>bst</sub></italic> is</p>
            <disp-formula><italic>r<sub>bst</sub></italic>=argmin<italic><sub>r</sub></italic> Δ<italic>IL'</italic>(<italic>g</italic>, <italic>r</italic>) <bold>(7)</bold></disp-formula>
          </boxed-text>
        </sec>
        <sec>
          <title>Strategies Against BFL-Attacks</title>
          <p>The <italic>NC</italic>-bounding strategy aims to maintain at least “<italic>k</italic>” new CaseID records in each group after excluding all old CaseID records. This is because all old CaseID records may become excludable by exploiting the previous releases, such as <italic>B</italic>-attack and <italic>L</italic>-attack. <italic>QID-</italic>covering is to generalize the <italic>QID</italic> value of records to prevent them from being excluded by <italic>B</italic>-attack and <italic>F</italic>-attack. <italic>NC</italic>-bounding allows the adversary to discover and exclude records not belonging to the target, but enforces the privacy requirement met by the remaining records. <italic>QID-</italic>covering, by contrast, perplexes the adversary to find out excludable records.</p>
        </sec>
        <sec>
          <title>Strategy for L-Attack</title>
          <sec>
            <title>Overview</title>
            <p>Recall that <italic>L</italic>-attack occurs as the adversary knows the exact published release to which the first ADE of the target <italic>v</italic> belongs. Specifically, let this release be <italic>R<sub>i</sub></italic>. All old CaseIDs in target <italic>v</italic>’s <italic>CI</italic> set in <italic>R<sub>i</sub></italic> refer to other targets, which are potentially excluded by the attacker and so should be discounted from forming a valid <italic>QID</italic> group; that is, after discounting them, the size of the <italic>QID</italic> group should still be at least <italic>k</italic>. For this reason, we use strategy <italic>NC</italic>-bounding.</p>
          </sec>
          <sec>
            <title>Example 1</title>
            <p>Consider the example in Scenario III. The target <italic>QID</italic> group &#60;Male, [30-35]&#62; in <xref ref-type="table" rid="table1">Table 1</xref> (quarter 3) contains 2 old CaseIDs (ie, 7 and 8). We need to add 2 other records with new CaseIDs to make <xref ref-type="table" rid="table1">Table 1</xref> (quarter 3) invulnerable to <italic>L</italic>-attack. In this case, all records in the <italic>QID</italic> group &#60;Female, [30-35]&#62; are new cases and the size of &#60;Female, [30-35]&#62; is no less than <italic>k</italic> + 2, so it still contains at least <italic>k</italic> records after 2 of them are removed. We can move any 2 of them (eg, 16 and 17) into &#60;Male, [30-35]&#62; and generalize the <italic>QID</italic> values accordingly. In general, to defend against <italic>L</italic>-attack, the number of new CaseID records in every <italic>QID</italic> group needs to be no less than <italic>k</italic>.</p>
          </sec>
        </sec>
        <sec>
          <title>Strategy for B-Attack</title>
          <sec>
            <title>Overview</title>
            <p>Suppose the target <italic>v</italic> is in <italic>R<sub>i</sub></italic>. <italic>B</italic>-attack means the adversary can link to <italic>R</italic><sub>1</sub>, <italic>R</italic><sub>2</sub>, ..., <italic>R<sub>i–</sub></italic><sub>1</sub> through the candidate CaseID set of <italic>v</italic> to exclude those CaseIDs definitely not belonging to target <italic>v</italic>. Note that all of the excludable CaseIDs in <italic>B</italic>-attack are old CaseIDs; thus, the situation is the same as <italic>L</italic>-attack in which all of the old CaseID records have a probability to be excluded. Therefore, the <italic>NC</italic>-bounding strategy used to defend <italic>L</italic>-attack can also be used to secure against <italic>B</italic>-attack. That is, the number of new CaseID records in every <italic>QID</italic> group needs to be larger than or equal to <italic>k</italic> in PPMS(<italic>k, θ*</italic>)-bounding. In this sense, <italic>L</italic>-attack is similar to <italic>B</italic>-attack, because both of them exploit the previous releases to find excludable CaseIDs. The main difference is that the former needs to know whether the CaseID is old or not, while the latter needs to compare the <italic>QID</italic> values to infer whether the CaseID belongs to the target.</p>
          </sec>
          <sec>
            <title>Example 2</title>
            <p>Consider the example in Scenario I. Similar to the previous example for <italic>L</italic>-attack, we have to include 2 records with new CaseIDs, say 8 and 9, into the <italic>QID</italic> group containing old CaseIDs 1 and 4 in <xref ref-type="table" rid="table1">Table 1</xref> (quarter 2), that is, &#60;ANY, [30-40]&#62;, and perform generalization accordingly. <xref ref-type="table" rid="table2">Table 2</xref> (quarter 2) shows the resulting anonymized table.</p>
            <table-wrap position="float" id="table2">
              <label>Table 2</label>
              <caption>
                <p>The anonymized releases against <italic>BFL</italic>-attack for the example in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
              </caption>
              <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
                <col width="30"/>
                <col width="260"/>
                <col width="210"/>
                <col width="210"/>
                <col width="290"/>
                <thead>
                  <tr valign="top">
                    <td colspan="2">Quarter and CaseID</td>
                    <td>Sex</td>
                    <td>Age</td>
                    <td>Disease</td>
                  </tr>
                </thead>
                <tbody>
                  <tr valign="top">
                    <td colspan="2">
                      <bold>3</bold>
                    </td>
                    <td>
                      <break/>
                    </td>
                    <td>
                      <break/>
                    </td>
                    <td>
                      <break/>
                    </td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>13</td>
                    <td>Female</td>
                    <td>[30-35]</td>
                    <td>Flu</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>14</td>
                    <td>Female</td>
                    <td>[30-35]</td>
                    <td>Diabetes</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>15</td>
                    <td>Female</td>
                    <td>[30-35]</td>
                    <td>Fever</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>16</td>
                    <td>ANY</td>
                    <td>[30-40]</td>
                    <td>Flu</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>17</td>
                    <td>ANY</td>
                    <td>[30-40]</td>
                    <td>Fever</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>7</td>
                    <td>ANY</td>
                    <td>[30-40]</td>
                    <td>Diabetes</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>8</td>
                    <td>ANY</td>
                    <td>[30-40]</td>
                    <td>Fever</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>18</td>
                    <td>ANY</td>
                    <td>[30-40]</td>
                    <td>HIV</td>
                  </tr>
                  <tr valign="top">
                    <td colspan="2">
                      <bold>2</bold>
                    </td>
                    <td>
                      <break/>
                    </td>
                    <td>
                      <break/>
                    </td>
                    <td>
                      <break/>
                    </td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>1</td>
                    <td>ANY</td>
                    <td>[30-40]</td>
                    <td>Flu</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>4</td>
                    <td>ANY</td>
                    <td>[30-40]</td>
                    <td>HIV</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>7</td>
                    <td>ANY</td>
                    <td>[30-40]</td>
                    <td>Diabetes</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>8</td>
                    <td>ANY</td>
                    <td>[30-40]</td>
                    <td>Fever</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>9</td>
                    <td>ANY</td>
                    <td>[30-40]</td>
                    <td>Flu</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>10</td>
                    <td>Male</td>
                    <td>[30-35]</td>
                    <td>Diabetes</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>11</td>
                    <td>Male</td>
                    <td>[30-35]</td>
                    <td>HIV</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>12</td>
                    <td>Male</td>
                    <td>[30-35]</td>
                    <td>Flu</td>
                  </tr>
                </tbody>
              </table>
            </table-wrap>
          </sec>
        </sec>
        <sec>
          <title>Strategy for F-Attack</title>
          <sec>
            <title>Overview</title>
            <p>Suppose the target is in <italic>R<sub>i</sub></italic>. <italic>F</italic>-attack means that the adversary can link to {<italic>R<sub>i</sub></italic><sub>+1</sub>, <italic>R<sub>i</sub></italic><sub>+2</sub>, ..., <italic>R<sub>n</sub></italic>} through the candidate CaseID set of the target and exclude the CaseIDs that are definitely not referring to the target. Unlike <italic>BL</italic>-attacks, <italic>F</italic>-attack exploits the subsequent releases. The <italic>NC</italic>-bounding strategy works for <italic>BL</italic>-attacks because we can find out which CaseIDs are excludable in the latest raw data set by using previous releases. Unfortunately, because <italic>R<sub>i</sub></italic><sub>+1</sub>, <italic>R<sub>i</sub></italic><sub>+2</sub>, ..., <italic>R<sub>n</sub></italic> are not yet published, there is no way to foresee which CaseIDs will be excluded in <italic>R<sub>i</sub></italic> by employing <italic>F</italic>-attack, which makes the <italic>NC</italic>-bounding strategy infeasible for defending against <italic>F</italic>-attack. By contrast, we know that the adversary can exploit <italic>R<sub>i</sub></italic> to perform <italic>F</italic>-attack to exclude records in <italic>R</italic><sub>1</sub>, <italic>R</italic><sub>2</sub>, ..., <italic>R<sub>i–</sub></italic><sub>1</sub>. Therefore, the focus is to protect <italic>R</italic><sub>1</sub>, <italic>R</italic><sub>2</sub>, ..., <italic>R<sub>i–</sub></italic><sub>1</sub> from <italic>F</italic>-attack through utilizing <italic>R<sub>i</sub></italic>. In other words, we have to consider how to anonymize <italic>D<sub>i</sub></italic> to <italic>R<sub>i</sub></italic>, making <italic>R<sub>i</sub></italic> non-exploitable for performing <italic>F</italic>-attack on <italic>R</italic><sub>1</sub>, <italic>R</italic><sub>2</sub>, ..., <italic>R<sub>i–</sub></italic><sub>1</sub>. 
By applying the same strategy to all subsequent releases after <italic>R<sub>i</sub></italic>, that is, <italic>R<sub>i+</sub></italic><sub>1</sub>, <italic>R<sub>i+</sub></italic><sub>2</sub>, ..., <italic>R<sub>n</sub></italic>, we protect <italic>R<sub>i</sub></italic> from <italic>F</italic>-attack.</p>
            <p>Let <italic>OC<sub>i</sub></italic> be the set of old CaseIDs present in at least one of the previous releases <italic>R</italic><sub>1</sub>, <italic>R</italic><sub>2</sub>, ..., <italic>R<sub>i–</sub></italic><sub>1</sub>. Consider a record <italic>r</italic> whose CaseID is in <italic>OC<sub>i</sub></italic>. Let <italic>O</italic>={<italic>r</italic><sub>1</sub>, <italic>r</italic><sub>2</sub>, ..., <italic>r<sub>p</sub></italic>} denote the set of records in the previous releases <italic>R</italic><sub>1</sub>, <italic>R</italic><sub>2</sub>, ..., <italic>R<sub>i–</sub></italic><sub>1</sub> that have the same CaseID as <italic>r</italic>. To prevent <italic>F</italic>-attack, we have to ensure that</p>
            <disp-formula>∀<italic>a</italic> ∈ <italic>QID</italic>, <italic>a</italic>(<italic>r</italic>) <inline-graphic xlink:href="medinform_v9i10e28752_fig14.png" xlink:type="simple" mimetype="image"/> <italic>a</italic>(<italic>r<sub>i</sub></italic>), for 1 ≤ <italic>i</italic> ≤ <italic>p</italic>.</disp-formula>
            <p>That is, the <italic>QID</italic> value of <italic>r</italic> should cover that of all <italic>r</italic>’s previous cases.</p>
          </sec>
          <sec>
            <title>Example 3</title>
            <p>Consider the example in Scenario II. To prevent the table published in Quarter 2 from <italic>F</italic>-attack, we have to generalize the 2 records, 7 and 8, in Quarter 3 to cover their corresponding predecessors in <xref ref-type="table" rid="table1">Table 1</xref> (quarter 2). This causes the <italic>QID</italic> value of case 7 to become “ANY, [30-40]” and that of case 8 remains unchanged. Because 7, 8, and 18 are in the same <italic>QID</italic> group, we have to generalize their <italic>QID</italic> values into the same value, that is, “ANY, [30-40]”. Finally, if <italic>L</italic>-attack is considered as well, as demonstrated in Example 1, we have to include cases 16 and 17 and finally obtain the result in <xref ref-type="table" rid="table2">Table 2</xref> (quarter 2).</p>
          </sec>
          <sec>
            <title>Lemma 1 (Covering Transitivity)</title>
            <p>Consider any 3 records, <italic>r</italic><sub>1</sub>, <italic>r</italic><sub>2</sub>, and <italic>r</italic><sub>3</sub>, with the same CaseID in 3 anonymous releases <italic>R<sub>i</sub></italic>, <italic>R<sub>j</sub></italic>, and <italic>R<sub>k</sub></italic>, <italic>i</italic>&#60;<italic>j</italic>&#60;<italic>k</italic>. If <italic>q<sup>r</sup></italic><sup>1</sup><inline-graphic xlink:href="medinform_v9i10e28752_fig15.png" xlink:type="simple" mimetype="image"/><italic>q<sup>r</sup></italic><sup>2</sup> and <italic>q<sup>r</sup></italic><sup>2</sup><inline-graphic xlink:href="medinform_v9i10e28752_fig15.png" xlink:type="simple" mimetype="image"/><italic>q<sup>r</sup></italic><sup>3</sup>, then <italic>q<sup>r</sup></italic><sup>1</sup><inline-graphic xlink:href="medinform_v9i10e28752_fig15.png" xlink:type="simple" mimetype="image"/><italic>q<sup>r</sup></italic><sup>3</sup>.</p>
            <p>Lemma 1 suggests an efficient approach for realizing <italic>QID</italic> covering against <italic>F</italic>-attack. When we are anonymizing <italic>D<sub>i</sub></italic> to <italic>R<sub>i</sub></italic>, rather than checking all of the old CaseID records in the previous releases, {<italic>R</italic><sub>1</sub>, <italic>R</italic><sub>2</sub>, ..., <italic>R<sub>i–</sub></italic><sub>1</sub>}, we only have to search for, starting from <italic>R<sub>i–</sub></italic><sub>1</sub> to <italic>R</italic><sub>1</sub>, the latest release containing old CaseID records. Once we find that release, we can stop checking the remaining ones.</p>
            <p>We next summarize how we can integrate these 2 strategies to meet the privacy requirement in Definition 6(a).</p>
          </sec>
          <sec>
            <title>Theorem 1</title>
            <p>A release <italic>R<sub>i</sub></italic> anonymized by following strategies of <italic>NC</italic>-bounding and <italic>QID</italic> covering satisfies the requirement of Definition 6(a). For proof, please see <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
          </sec>
        </sec>
        <sec>
          <title>Strategy Against Attribute Disclosure</title>
          <sec>
            <title>Overview</title>
            <p>The privacy disclosure caused by <italic>BFL</italic>-attacks not only includes record disclosure but also attribute disclosure. This is illustrated with the following example.</p>
          </sec>
          <sec>
            <title>Example 4</title>
            <p>Consider the 3 consecutive quarters of the 3-anonymous release in <xref ref-type="table" rid="table1">Table 1</xref>. Recall that in Scenario I the adversary can link to <xref ref-type="table" rid="table1">Table 1</xref> (quarter 3) through the <italic>QID</italic> value of Alice {Female, 32} and perceive the <italic>CI</italic> of Alice is {1, 4, 7}, inferring the probability of Alice having any of {Flu, HIV, Diabetes} is 1/3. After employing <italic>B</italic>-attack via Quarter 1, <italic>CI</italic> is reduced to {4, 7}, so the adversary’s confidence that Alice has HIV or diabetes increases to 1/2. He/she can further exclude CaseID 7 from <italic>CI</italic> by performing <italic>F</italic>-attack via Quarter 3 and be 100% sure that Alice has HIV.</p>
            <p>Now let us consider how to prevent the attribute disclosure caused by <italic>BFL</italic>-attacks. The basic idea is to control the ratio of sensitive values in each <italic>QID</italic> group to be no greater than the specified threshold. Consider our proposed strategies against <italic>BFL</italic>-attacks stated in the previous section. Let <italic>S<sub>g</sub></italic>={<italic>s</italic><sub>1</sub>, <italic>s</italic><sub>2</sub>, ..., <italic>s<sub>p</sub></italic>} denote the set of sensitive values in a <italic>QID</italic> group <italic>g</italic> and (<italic>θ</italic><sub>1</sub>, <italic>θ</italic><sub>2</sub>, ..., <italic>θ<sub>p</sub></italic>) the corresponding thresholds specified for <italic>S<sub>g</sub></italic>. We can derive the following occurrence bound for each sensitive value within <italic>g</italic> to meet the required threshold.</p>
          </sec>
          <sec>
            <title>Lemma 2</title>
            <p>For any sensitive value <italic>s</italic>∈<italic>S<sub>g</sub></italic>, the maximal number of cases in <italic>g</italic> that contain <italic>s</italic> without breaking the associated threshold <italic>θ<sub>s</sub></italic>, denoted by <italic>η<sub>s</sub></italic>(<italic>g</italic>), is</p>
            <disp-formula>
              <graphic xlink:href="medinform_v9i10e28752_fig16.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
            <p>where &#124;<italic>NC</italic>(<italic>g</italic>)&#124; is the number of new CaseIDs in <italic>g</italic>. For proof, please see <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p>
          </sec>
        </sec>
        <sec>
          <title>Algorithm PPMS-Anonymization</title>
          <p><xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref> presents our algorithm PPMS-Anonymization, which is composed of 3 stages. The first stage aims at finding out old CaseID records and generalizing their <italic>QID</italic> values in advance to achieve <italic>QID-</italic>covering against <italic>F</italic>-attack. Because there may exist multiple individual records [<xref ref-type="bibr" rid="ref9">9</xref>] in ADE data sets, we follow the <italic>combined record</italic> (or <italic>super record</italic>) concept in [<xref ref-type="bibr" rid="ref9">9</xref>] to deal with this issue. All records with the same CaseID are combined into a super record before starting to form <italic>QID</italic> groups. Without this process, the records with identical CaseIDs may be divided into different <italic>QID</italic> groups, leading to more substantial deviation in the data quality and perplexing the process of identifying duplicate records while detecting ADR signals.</p>
          <p>To find out old CaseID records in <italic>D<sub>i</sub></italic> and generalize their <italic>QID</italic> values in advance, we check previous releases <italic>R<sub>pre</sub></italic> from <italic>R<sub>i–</sub></italic><sub>1</sub> to <italic>R<sub>i–x</sub></italic> (if <italic>i</italic>=1, <italic>R<sub>pre</sub></italic>=null). Because CaseID is used to trace an event’s follow-ups, there is typically a life span of CaseID, denoted by <italic>x.</italic> The generalization of old CaseID records aims at achieving <italic>QID-</italic>covering against <italic>F</italic>-attack. Because of the transitive property of <italic>QID</italic> value shown in Lemma 1, once we discover an old CaseID record <italic>r'</italic> in any one of the previous releases, we stop checking the remaining earlier releases by using “break” (line 13 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>) to end the “while loop” (line 8 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p>
          <p>The second stage shown in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref> is activated by calling the procedure <italic>Grouping</italic>, forming as many <italic>QID</italic> groups satisfying PPMS(<italic>k</italic>, <italic>θ*</italic>)-bounding as possible. Each group begins with a randomly chosen seed record, gradually growing by adding a record with the least Δ<italic>IL'</italic> (defined in Equation 7) until there are at least <italic>k</italic> new CaseID records to achieve the <italic>NC</italic>-bounding strategy. The <italic>OldCaseNum</italic> function returns the number of old CaseID records in a group. A new group then begins with the new record most distinguished from the one just added into the latest group. The above steps are repeated until the remaining records fail to form a group, for example, the number of new CaseID records is less than <italic>k</italic> or the ratio of all sensitive values within the remaining records is higher than the associated threshold (see line 10 in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>).</p>
          <p>The last stage is activated by calling the function <italic>Generalization</italic> (<xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>), which processes the remaining ungrouped records by assigning each of them into the most feasible group that produces the minimal Δ<italic>IL'</italic> to sustain the data utility and satisfy the privacy requirement. Next, the super records will be split back to the original records (the group they belong to remains unchanged). Finally, all records within the same group are generalized into the same <italic>QID</italic> value to satisfy PPMS(<italic>k</italic>, <italic>θ*</italic>)-bounding.</p>
        </sec>
        <sec>
          <title>Algorithm PPMS<sup>+</sup>-Anonymization</title>
          <p>In this section, we propose an improvement of our PPMS-Anonymization algorithm: PPMS<sup>+</sup>-Anonymization. The idea is to neglect the subsequent <italic>QID</italic> coverings that accumulate when the transitivity-based approach of Lemma 1 is applied.</p>
          <p>Let <italic>r</italic> be a record in <italic>D<sub>i</sub></italic> whose CaseID is <italic>c</italic>, <italic>q<sup>r</sup></italic> the <italic>QID</italic> value of <italic>r</italic>, and <italic>r</italic><sub>1</sub>, <italic>r</italic><sub>2</sub>, ..., <italic>r<sub>p</sub></italic> be the older versions of <italic>r</italic> in the previous releases <italic>R</italic><sub>1</sub>, <italic>R</italic><sub>2</sub>, ..., <italic>R<sub>i–</sub></italic><sub>1</sub>. To prevent <italic>F</italic>-attack, we have to make <italic>q<sup>r</sup></italic> cover {<italic>q<sup>r</sup></italic><sup>1</sup>, <italic>q<sup>r</sup></italic><sup>2</sup>, ..., <italic>q<sup>rp</sup></italic>}. Although we have exploited the transitivity property in Lemma 1 to avoid checking out all of the old CaseID records in releases <italic>R</italic><sub>1</sub>, <italic>R</italic><sub>2</sub>, ..., <italic>R<sub>i–</sub></italic><sub>1</sub>, the <italic>QID</italic> value suffers from accumulated generalization. That is, the later the record <italic>r</italic> is published, the more information loss will be caused by generalization. Fortunately, we can limit the accumulated generalization by neglecting all subsequent <italic>QID</italic> coverings.</p>
          <p>The fact is that some of the records protected by <italic>QID-</italic>covering against <italic>F</italic>-attack still can be eliminated by <italic>BL</italic>-attacks. Following the previous discussion, let <italic>r</italic><sub>1</sub> be the earliest record with CaseID=<italic>c</italic>. Without loss of generality, assume <italic>r</italic><sub>1</sub> resides in <italic>R</italic><sub>1</sub>. Then clearly, <italic>c</italic> is a new case in <italic>R</italic><sub>1</sub>, that is, <italic>c</italic> ∈ <italic>NC</italic>(<italic>R</italic><sub>1</sub>), and will be an old case in all subsequent releases, that is, <italic>c</italic> ∈ <italic>OC</italic>(<italic>R<sub>j</sub></italic>), 2 ≤ <italic>j</italic> ≤ <italic>i–</italic>1. Remember that all old CaseIDs have the potential to be excluded by <italic>BL</italic>-attacks. So even if we make <italic>q<sup>r</sup></italic> cover {<italic>q<sup>r</sup></italic><sup>2</sup>, <italic>q<sup>r</sup></italic><sup>3</sup>, ..., <italic>q<sup>ri</sup></italic><sup>-1</sup>} to prevent {<italic>r</italic><sub>2</sub>, <italic>r</italic><sub>3</sub>, ..., <italic>r<sub>i</sub></italic><sub>-1</sub>} from being excluded by <italic>F</italic>-attack, they can still be excluded by <italic>BL</italic>-attack. This means that generalizing <italic>q<sup>r</sup></italic> to cover {<italic>q<sup>r</sup></italic><sup>2</sup>, <italic>q<sup>r</sup></italic><sup>3</sup>, ..., <italic>q<sup>ri</sup></italic><sup>-1</sup>} is useless. It suffices to generalize <italic>q<sup>r</sup></italic> to cover <italic>q<sup>r</sup></italic><sup>1</sup>. <xref rid="figure1" ref-type="fig">Figure 1</xref> illustrates this concept.</p>
          <fig id="figure1" position="float">
            <label>Figure 1</label>
            <caption>
              <p>Idea illustration of neglecting subsequent coverings.</p>
            </caption>
            <graphic xlink:href="medinform_v9i10e28752_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <p><xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref> shows PPMS<sup>+</sup>-Anonymization, the improved version of PPMS-Anonymization in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref> (lines 5-18). For the given record <italic>r</italic>, the modified version searches from <italic>R<sub>i–x</sub></italic> to <italic>R<sub>i–</sub></italic><sub>1</sub> to find the earliest release in which <italic>r</italic> occurs. Once we find the earliest old CaseID record <italic>r'</italic>, we stop checking the remaining releases.</p>
        </sec>
        <sec>
          <title>Algorithm PPMS<sup>++</sup>-Anonymization</title>
          <sec>
            <title>Overview</title>
            <p>In <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>, the procedure Grouping works by picking and adding the record with the least Δ<italic>IL'</italic> into the group, overlooking whether the record is a new or an old case in <italic>D'</italic>. We observed that this mixture of new and old cases to form a <italic>QID</italic> group would paralyze the discrimination of Δ<italic>IL</italic> in choosing good candidate records, that is, Equation 7, and cause severe information loss.</p>
            <p>Suppose an old CaseID record <italic>r</italic> is picked as the seed to start a new <italic>QID</italic> group <italic>g</italic> in the procedure Grouping. As an old case, the <italic>QID</italic> value of <italic>r</italic> has already been generalized to cover its earliest clone record <italic>r'</italic> in some previous release, meaning that <italic>q<sup>r</sup></italic> is as coarse as the group in which <italic>r'</italic> resides. Therefore, if there exist some isolated records whose <italic>QID</italic> values are covered by <italic>q<sup>r</sup></italic>, then adding these records into <italic>g</italic> yields no increase in information loss (ie, Δ<italic>IL</italic>=0). Although this does not affect the information loss of group <italic>g</italic>, it does increase the information loss of the selected record. And in this situation, the Grouping procedure will randomly choose one from those isolated records, disregarding different degrees of information loss brought to these isolated records.</p>
          </sec>
          <sec>
            <title>Example 5</title>
            <p>Consider <xref ref-type="table" rid="table3">Table 3</xref>. We assume the age attribute has been discretized following the taxonomy tree in <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>. The first 3 records form a group starting with the old case record 1, while records 4, 5, and 6 are new cases. Adding any of the 3 isolated records into this group yields no change in the group information loss because all of their <italic>QID</italic> values are covered by record 1. This makes no distinction in choosing the isolated records, but record 6 is the best choice, which exhibits the least data distortion after <italic>QID</italic> generalization.</p>
            <table-wrap position="float" id="table3">
              <label>Table 3</label>
              <caption>
                <p>An illustration of the problem of <italic>QID</italic> grouping starting with an old case.</p>
              </caption>
              <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
                <col width="30"/>
                <col width="340"/>
                <col width="210"/>
                <col width="210"/>
                <col width="210"/>
                <thead>
                  <tr valign="top">
                    <td colspan="2"><italic>QID</italic> group and isolated records</td>
                    <td>Sex</td>
                    <td>Age</td>
                    <td>Disease</td>
                  </tr>
                </thead>
                <tbody>
                  <tr valign="top">
                    <td colspan="2">
                      <bold>A forming <italic>QID</italic> group</bold>
                    </td>
                    <td>
                      <break/>
                    </td>
                    <td>
                      <break/>
                    </td>
                    <td>
                      <break/>
                    </td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>CaseID 1</td>
                    <td>ANY</td>
                    <td>Nonadult</td>
                    <td>Flu</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>CaseID 2</td>
                    <td>ANY</td>
                    <td>Nonadult</td>
                    <td>Flu</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>CaseID 3</td>
                    <td>ANY</td>
                    <td>Nonadult</td>
                    <td>Fever</td>
                  </tr>
                  <tr valign="top">
                    <td colspan="2">
                      <bold>Isolated records</bold>
                    </td>
                    <td>
                      <break/>
                    </td>
                    <td>
                      <break/>
                    </td>
                    <td>
                      <break/>
                    </td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>CaseID 4</td>
                    <td>Female</td>
                    <td>Newborn</td>
                    <td>Fever</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>CaseID 5</td>
                    <td>Male</td>
                    <td>Preschool</td>
                    <td>Flu</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>CaseID 6</td>
                    <td>Female</td>
                    <td>Adolescent</td>
                    <td>Diabetes</td>
                  </tr>
                </tbody>
              </table>
            </table-wrap>
            <p>To solve this problem, we avoid mixing new CaseID and old CaseID records in forming <italic>QID</italic> groups. Instead, we separate old CaseID records from <italic>Dʹ</italic> before starting the procedure Grouping, forming possible <italic>QID</italic> groups composed of only new CaseID records. The set of old CaseID records and the remaining new CaseID records are later dealt with by the function Generalization. <xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref> describes the modification of <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref> to realize PPMS<sup>++</sup>-Anonymization, an improvement of PPMS<sup>+</sup>-Anonymization by grouping new cases first.</p>
          </sec>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Overview</title>
        <p>We designed a series of experiments to examine the effectiveness of our new method in anonymizing a series of periodically released SRS data sets. The proposed PPMS-Anonymization algorithm and its extensions, PPMS<sup>+</sup>-Anonymization and PPMS<sup>++</sup>-Anonymization, were compared with method MS-Anonymization. In this section, we describe the details of each experiment, including the experimental results and our observations.</p>
      </sec>
      <sec>
        <title>Experimental Setup</title>
        <p>The data used in our experiment consist of 32 quarterly collections from FAERS, including 2004Q1 to 2011Q4. We used attributes {<italic>Weight</italic>, <italic>Age</italic>, <italic>Gender</italic>} as <italic>QID</italic>, where <italic>Weight</italic> is numerical while the other 2 are categorical, with drug indication (<italic>INDI_PT</italic>) and drug reaction (<italic>PT</italic>) as <italic>SA</italic>. To view <italic>Age</italic> as categorical, we adopted the age taxonomy defined in MeSH [<xref ref-type="bibr" rid="ref27">27</xref>] (<xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>). Moreover, we discarded records that have missing values in either <italic>QID</italic> or <italic>SA</italic> attributes.</p>
        <p>We respectively performed MS-Anonymization [<xref ref-type="bibr" rid="ref9">9</xref>] and 3 versions of PPMS-Anonymization, including the original version of PPMS-Anonymization (PPMS), the improved version by incorporating neglecting subsequent coverings (PPMS<italic><sup>+</sup></italic>), and the advanced version by employing neglecting subsequent coverings and grouping with new cases (PPMS<sup>++</sup>), to anonymize the selected FAERS data sets, and computed the information loss of 2 series of anonymized data sets. We then imitated the behavior of the adversary, employing <italic>BFL</italic>-attacks to find out all excludable CaseIDs in 2 series of anonymized data sets. After that, we removed all excludable records, and evaluated the risk of record and attribute disclosure of 2 series of anonymized data sets.</p>
        <p>We examined 2 aspects of anonymized data sets: information loss and PR. The information loss of an anonymized data set is measured by <italic>normalized information loss</italic> (<italic>NIL</italic>), meaning the average <italic>IL</italic> (using Equation 1) for each attribute of each record.</p>
        <disp-formula>
          <graphic xlink:href="medinform_v9i10e28752_fig11.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>where <italic>R</italic> is an anonymized data set, <italic>g</italic> is a <italic>QID</italic>-group, <italic>GroupNum</italic>(<italic>R</italic>) denotes the number of <italic>QID</italic> groups in <italic>R</italic>, and &#124;<italic>QID</italic>&#124; is the number of attributes in <italic>QID</italic>. This yields <italic>NIL</italic> ranging in [0, 1]; the larger the <italic>NIL</italic> is, the more serious is the information loss.</p>
        <p>We also used the 2 criteria in [<xref ref-type="bibr" rid="ref9">9</xref>] to measure the privacy disclosure, <italic>dangerous identity ratio</italic> (<italic>DIR</italic>) and <italic>dangerous sensitivity ratio</italic> (<italic>DSR</italic>); the former measures the ratio of <italic>QID</italic> groups that violate the privacy requirement for protecting record identity, while the latter measures the ratio of <italic>QID</italic> groups that expose sensitive values.</p>
        <disp-formula><italic>DIR</italic>(<italic>R</italic>)=<italic>DIGNum</italic>(<italic>R</italic>)/<italic>GroupNum</italic>(<italic>R</italic>) <bold>(10)</bold></disp-formula>
        <disp-formula><italic>DSR</italic>(<italic>R</italic>)=<italic>DSGNum</italic>(<italic>R</italic>)/<italic>GroupNum</italic>(<italic>R</italic>) <bold>(11)</bold></disp-formula>
        <p>If the number of records in a <italic>QID</italic> group is less than the threshold <italic>k</italic>, we say this group is a <italic>dangerous identity group</italic> (<italic>DIG</italic>). <italic>DIGNum</italic>(<italic>R</italic>) denotes the number of <italic>DIG</italic>s in the anonymized data set <italic>R</italic>. A <italic>QID</italic> group is a <italic>dangerous sensitivity group</italic> (<italic>DSG</italic>) if it contains at least one unsafe sensitive value whose frequency is higher than the associated threshold. <italic>DSGNum</italic>(<italic>R</italic>) denotes the number of <italic>DSGs</italic> in <italic>R</italic>.</p>
        <p>To observe the influence of 2 anonymization methods on the strength of ADR signals, we chose from FDA MedWatch [<xref ref-type="bibr" rid="ref28">28</xref>] all significant ADR rules involving patient demographics such as age or gender conditions and causing withdrawal or warning of the drug. A detailed description of these ADR rules is presented in <xref ref-type="table" rid="table4">Table 4</xref>. To measure the strength of ADR signals, we used the proportional reporting ratio (PRR) [<xref ref-type="bibr" rid="ref29">29</xref>] (described in <xref ref-type="supplementary-material" rid="app10">Multimedia Appendix 10</xref>), which is used by the UK Yellow Card database and the UK Medicines and Healthcare products Regulatory Agency (MHRA).</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Selected adverse drug reaction rules from Food and Drug Administration MedWatch.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="430"/>
            <col width="170"/>
            <col width="120"/>
            <col width="250"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Drug name and adverse reaction</td>
                <td>Demographic condition</td>
                <td>Marked year</td>
                <td>Withdrawn or warning year</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">
                  <bold>Avandia</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Myocardial infarction</p>
                    </list-item>
                    <list-item>
                      <p>Death</p>
                    </list-item>
                    <list-item>
                      <p>Cerebrovascular accident</p>
                    </list-item>
                  </list>
                </td>
                <td>Age&#62;18</td>
                <td>1999</td>
                <td>2010</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>Tysabri</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Progressive multifocal leukoencephalopathy</p>
                    </list-item>
                  </list>
                </td>
                <td>Age&#62;18</td>
                <td>2004</td>
                <td>2005</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>Zelnorm</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Cerebrovascular accident</p>
                    </list-item>
                  </list>
                </td>
                <td>Sex=Female</td>
                <td>2002</td>
                <td>2007</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>Warfarin</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Myocardial infarction</p>
                    </list-item>
                  </list>
                </td>
                <td>Age&#62;60</td>
                <td>1940</td>
                <td>2014</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>Revatio</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Death</p>
                    </list-item>
                  </list>
                </td>
                <td>Age&#62;18</td>
                <td>2008</td>
                <td>2014</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>We considered 3 ways of setting <italic>θ</italic>*. First, we applied a uniform setting on <italic>θ</italic>*, that is, all confidence thresholds of symptoms were set to the same value (0.2 or 0.4). Then, we used a frequency-based method to determine the threshold of each symptom, which is based on the following idea: The more frequently the symptom occurs, the less sensitive it is. For this purpose, we calculated the average count of symptoms <italic>m</italic> and the corresponding SD. Then we set the confidence thresholds of symptoms whose occurrence is less than <italic>m</italic> – SD, between <italic>m</italic> – SD and <italic>m</italic> + SD, and higher than <italic>m</italic> + SD to 0.2, 0.6, and 1, respectively. Last, we adopted a level-wise confidence setting, which is similar to the frequency setting but conforming to well-recognized medical sensitive terms. All symptoms were classified into 3 levels: high sensitive (<italic>θ</italic>=0.2), low sensitive (<italic>θ</italic>=0.4), and nonsensitive (<italic>θ</italic>=1.0). For this purpose, we followed the setting in [<xref ref-type="bibr" rid="ref9">9</xref>], choosing the group of symptoms related to AIDS: “Acquired immunodeficiency syndromes” in MedDRA (Medical Dictionary for Regulatory Activities) as high sensitive, 2 groups called “Coughing and associated symptoms” and “Allergies to foods, food additives, drugs and other chemicals” as nonsensitive, and those not belonging to the above groups as low sensitive.</p>
      </sec>
      <sec>
        <title>Results on Anonymization Quality</title>
        <p>This section will report the results on information loss and privacy disclosure of MS-Anonymization and our proposed 3 versions of PPMS-Anonymization under 3 different settings of <italic>θ*</italic>.</p>
        <sec>
          <title>Uniform Confidence Setting</title>
          <p>In this evaluation, we set a uniform threshold (<italic>θ*</italic>=0.2 and 0.4) to each symptom, that is, the sensitivity of each symptom is the same, and 2 settings of <italic>k</italic> (<italic>k</italic>=5, 10).</p>
          <sec>
            <title>Information Loss</title>
            <p>First, we evaluated the information loss. As per the results shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>A-D, the general trend is when <italic>θ</italic>* is lower, the information loss is higher. It is because more records with different sensitive values have to be grouped together to form a valid <italic>QID</italic> group, so more generalization has to be performed. Among the 3 versions of PPMS-Anonymization, PPMS<sup>++</sup> leads the rank, followed by PPMS<sup>+</sup> and PPMS, with average improvements of 51% and 59% for PPMS<sup>++</sup> over PPMS<sup>+</sup> and PPMS, respectively, as <italic>θ</italic>*=0.2 and <italic>k</italic>=5, and reaching 78% and 82% for <italic>θ</italic>*=0.4 and <italic>k</italic>=10. We noticed that as <italic>θ</italic>*=0.2, some anonymized data sets fail to meet the privacy requirement, that is, 2006Q1, 2006Q2, 2007Q1, and 2010Q3. A further inspection revealed that these data sets contain some highly frequent symptoms. For example, there are 20,467 cases (without missing values) in 2007Q1, and 3877 (18.94%) of them contain “Diabetes Mellitus Non-Insulin-Dependent”. All methods fail in this data set because the minimum bound of that symptom should be 21.00% (3877/18,462, where 18,462 is the number of new cases), so the privacy requirement of 20% cannot be satisfied. In the data set 2010Q3, there are 12,727/56,550 (22.51%) cases containing “Smoking Cessation Therapy,” so no method can meet the privacy requirement. (In 2006Q1 and 2006Q2, the symptom “Myocardial Infarction” is frequent.) In general, the uniform threshold setting is not suitable, especially when some sensitive values are persistent.</p>
            <fig id="figure2" position="float">
              <label>Figure 2</label>
              <caption>
                <p>Evaluation on information loss and privacy disclosure for Food and Drug Administration Adverse Event Reporting System (FAERS) data anonymized by different methods with uniform setting of <italic>θ*</italic>. DIR: dangerous identity ratio, DSR: dangerous sensitivity ratio, NIL: normalized information loss, PPMS: periodical-publishing multisensitive.</p>
              </caption>
              <graphic xlink:href="medinform_v9i10e28752_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </fig>
          </sec>
          <sec>
            <title>Record Disclosure</title>
            <p>Next, we compared the record disclosure caused by each method. The results are shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>E-H. MS-Anonymization exhibits serious record disclosure. The average <italic>DIR</italic>s for <italic>k</italic>=5 and 10 are 0.61 and 0.8, respectively, meaning over half of <italic>QID</italic> groups are <italic>DIG</italic>s. Besides, the <italic>DIR</italic> of MS-Anonymization increases as <italic>k</italic> is larger. This is because a larger <italic>k</italic> leads to fewer groups and thus a higher ratio of groups containing old cases, increasing the risk of <italic>QID</italic> groups becoming dangerous. It is noteworthy that the <italic>DIR</italic>s of 3 versions of PPMS-Anonymization are all 0. The reason is that our method guarantees freedom from record disclosure and the <italic>DIR</italic> metric is not dependent on different settings of <italic>θ</italic>*.</p>
          </sec>
          <sec>
            <title>Attribute Disclosure</title>
            <p>Finally, we present the results on the <italic>DSR</italic> metric. The results are shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>I and J. MS-Anonymization yields very high <italic>DSRs</italic>, 0.6 on average, for lower <italic>θ</italic>* values (<italic>θ</italic>=0.2). This is because a lower <italic>θ</italic> is more likely to cause the number of symptoms close to its maximal allowed number in the <italic>QID</italic> groups, especially for high-frequent symptoms. Thus, the action of excluding records is more likely to cause the violation of <italic>θ</italic>* and so leads to relatively higher <italic>DSR</italic>s, such as 2006Q1, 2006Q2, 2007Q1, and 2010Q3. By contrast, the maximal symptom frequencies in 2006Q4 and 2010Q1 are only 8.1% and 9.1%, respectively, relatively smaller than <italic>θ*</italic>=0.2 or 0.4, so the <italic>DSRs</italic> of these 2 releases are relatively lower than other releases. This again demonstrates that the uniform threshold setting is not feasible. The setting of <italic>k</italic> also influences the <italic>DSRs</italic> yielded by MS-Anonymization. A larger <italic>k</italic> not only causes higher maximal allowed numbers of symptoms in <italic>QID</italic> groups but also reduces the change in the ratio of symptoms when some records are excluded. Compared with MS-Anonymization, all 3 versions of PPMS-Anonymization yield zero <italic>DSR</italic> value in all data sets, except 2006Q1, 2006Q2, and 2007Q1, showing our method can protect data from attribute disclosure caused by <italic>BFL</italic>-attacks.</p>
          </sec>
        </sec>
        <sec>
          <title>Frequency-Based Confidence Setting</title>
          <p>Two different settings of <italic>k</italic> (5 or 10) are considered. The results on <italic>DIR</italic> are omitted because they are similar to those generated by uniform setting, that is, MS-Anonymization generates large <italic>DIR</italic>s while our PPMS-Anonymization yields zero <italic>DIR</italic>.</p>
          <sec>
            <title>Information Loss</title>
            <p>As shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>A and B, the <italic>NIL</italic>s generated by each method are better than those under the uniform setting. It is not surprising because this more flexible setting easily allows the methods to choose the closer new record to be added during <italic>QID</italic> group construction. Similar to those observed for the uniform setting, PPMS<sup>++</sup> significantly outperforms PPMS<sup>+</sup> and PPMS, yielding <italic>NILs</italic> less than 0.05 for <italic>k</italic> =5 and 0.15 for <italic>k</italic> =10.</p>
            <fig id="figure3" position="float">
              <label>Figure 3</label>
              <caption>
                <p>Evaluation on information loss and privacy disclosure for Food and Drug Administration Adverse Event Reporting System (FAERS) data anonymized by different methods with frequency-based setting of <italic>θ*</italic>. DSR: dangerous sensitivity ratio, NIL: normalized information loss, PPMS: periodical-publishing multisensitive.</p>
              </caption>
              <graphic xlink:href="medinform_v9i10e28752_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </fig>
          </sec>
          <sec>
            <title>Attribute Disclosure</title>
            <p>As shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>C and D, all data sets anonymized by PPMS-Anonymization are free of attribute disclosure (ie, zero <italic>DSR</italic>). The <italic>DSR</italic>s of MS-Anonymization are very small compared with those in previous settings. It is because those <italic>DSG</italic>s in the previous experiments are caused by high frequent symptoms, whose thresholds, however, are set to 1 in this experiment. In FAERS data, there are more than 20,000 different symptoms. It is hard to determine a suitable threshold for each of them without background knowledge. Therefore, the frequency-based method is a convenient and reasonable way to deal with this issue. This also demonstrates the value of allowing nonuniform settings in our model.</p>
          </sec>
        </sec>
        <sec>
          <title>Level-Wise Confidence Setting</title>
          <p>Again, 2 different <italic>k</italic> (5 and 10) settings are considered, and for the same reason, we omit the results on <italic>DIR</italic>.</p>
          <sec>
            <title>Information Loss</title>
            <p><xref rid="figure4" ref-type="fig">Figure 4</xref>A and B shows that although PPMS and PPMS<sup>+</sup> yield more information loss than that by MS-Anonymization, PPMS<sup>++</sup> behaves comparably to MS-Anonymization. The <italic>NIL</italic>s are very similar to those under the frequency-based setting.</p>
            <fig id="figure4" position="float">
              <label>Figure 4</label>
              <caption>
                <p>Evaluation on information loss and privacy disclosure for Food and Drug Administration Adverse Event Reporting System (FAERS) data anonymized by different methods with level-wise setting of <italic>θ*</italic>. DSR: dangerous sensitivity ratio, NIL: normalized information loss, PPMS: periodical-publishing multisensitive.</p>
              </caption>
              <graphic xlink:href="medinform_v9i10e28752_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </fig>
          </sec>
          <sec>
            <title>Attribute Disclosure</title>
            <p>The results in <xref rid="figure4" ref-type="fig">Figure 4</xref>C and D show that all 3 versions of PPMS-Anonymization cause no attribute disclosure (with zero <italic>DSRs</italic>), but large <italic>DSRs</italic> are observed for MS-Anonymization. We can see that the <italic>DSRs</italic> of MS-Anonymization in some quarters are relatively higher, just similar to the results in <xref rid="figure2" ref-type="fig">Figure 2</xref>K and L and <xref rid="figure3" ref-type="fig">Figure 3</xref>C and D.</p>
          </sec>
        </sec>
      </sec>
      <sec>
        <title>Influence on ADR Signals</title>
        <sec>
          <title>Selected Signals</title>
          <p>In this experiment, we inspected variation on the strength of observed ADR signals shown in <xref ref-type="table" rid="table4">Table 4</xref> between before and after anonymization. Because some signals exhibit similar performance, we only show 3 representatives with different demographic conditions, that is, the signals related to Avandia, Zelnorm, and Warfarin, which are shown as follows:</p>
          <p>R1: Avandia, Age&#62;18 → Myocardial infarction</p>
          <p>R2: Zelnorm, Sex=Female → Cerebrovascular accident</p>
          <p>R3: Warfarin, Age&#62;60 → Myocardial infarction</p>
          <p>We calculated its occurrences, PRRs, and compared the values with the original values for each signal. We omit the results for uniform setting <italic>θ*</italic>=0.4 and level-wise setting because similar results were observed for uniform setting <italic>θ*</italic>=0.2 and frequency-based setting, respectively.</p>
          <p>To highlight the impact of anonymization on rare events, we set PRR=0 when <italic>a</italic>&#60;3, where <italic>a</italic> denotes the number of reports that satisfy the specific ADR rule. The threshold <italic>a</italic>≥3 follows Evans et al [<xref ref-type="bibr" rid="ref29">29</xref>], who investigated a group of newly marketed drugs and suggested that the minimum criteria for a signal are <italic>a</italic>≥3 and PRR&#62;2.</p>
          <p>The original count and PRR of these 3 rules are shown in <xref rid="figure5" ref-type="fig">Figure 5</xref>. Rule R1 is a signal with an extremely high occurrence and significant strength, rule R2 is the one with the relatively small occurrence and medium strength, while R3 represents medium occurrence and relatively little strength.</p>
          <fig id="figure5" position="float">
            <label>Figure 5</label>
            <caption>
              <p>The original counts and proportional reporting ratios (PRRs) of rules R1, R2, and R3.</p>
            </caption>
            <graphic xlink:href="medinform_v9i10e28752_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Signal Occurrence Variation</title>
          <p>We first evaluated the variation of signal occurrence (count) caused by anonymization. The results are shown in <xref rid="figure6" ref-type="fig">Figure 6</xref>. Notice that there is no result for several quarters (eg, 2007Q1, 2010Q3) under the uniform setting. The reason is the same as that for information loss. Generally, the variation yielded by frequency-based setting is much less than that by uniform setting, and a larger <italic>k</italic> causes more missing counts. For signals with extremely high occurrence like R1, the variation can be substantial; for example, it reaches 180 for PPMS with <italic>k</italic>=10 and uniform confidence setting. In the same case, our PPMS<sup>++</sup> exhibits outstanding performance, only causing variation of less than 10. We also note that some quarters are suffering significant count variation for rule R2 (<xref rid="figure6" ref-type="fig">Figure 6</xref>E-H). This is because the taxonomy of Gender is relatively flat, composed of only 2 levels. Once the gender of a report satisfying this rule is generalized, it will become “Any” and increase the missing count of this rule. For example, in <xref rid="figure6" ref-type="fig">Figure 6</xref>F, when <italic>k</italic>=10, 7 of 11 counts are missing in 2007Q2 for PPMS. In fact, when <italic>k</italic>=10, the ratio of reports with Gender=Any is at least 25% and 45% from 2010Q4 to 2011Q4 for PPMS<sup>+</sup> and PPMS, respectively, which causes serious bias on the count of ADR rule. By contrast, as shown in <xref rid="figure6" ref-type="fig">Figure 6</xref>G and H, the frequency-based setting exhibits lower missing count. The overall situation shows that PPMS<sup>++</sup> significantly outperforms PPMS and PPMS<sup>+</sup>, and demonstrates comparable results with MS-Anonymization.</p>
          <fig id="figure6" position="float">
            <label>Figure 6</label>
            <caption>
              <p>Variations in signal count for different anonymization methods under uniform and frequency-based settings of <italic>θ*</italic>. PPMS: periodical-publishing multisensitive.</p>
            </caption>
            <graphic xlink:href="medinform_v9i10e28752_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Signal Strength Variation</title>
          <p><xref rid="figure7" ref-type="fig">Figure 7</xref> shows the results on the PRR difference. Similar to that observed for occurrence variation, the frequency-based setting yields more negligible PRR difference than that by uniform setting. For rule R1 with enormous strength, the PRR variation is significantly higher than those for rules R2 and R3. The variations caused by PPMS and PPMS<sup>+</sup> fluctuate seriously, sometimes much higher, reaching 5 for <italic>k</italic>=10 and uniform setting of <italic>θ*</italic>; PPMS<sup>++</sup> exhibits relatively small variation under the same situation. For rule R2 with attributes of flat taxonomy, we observe a similar phenomenon. Specifically, a sharply significant variation, reaching –14 (<xref rid="figure7" ref-type="fig">Figure 7</xref>E, F, and H), is observed in 2007Q4 for PPMS and PPMS<sup>+</sup>. This is because the <italic>a</italic> value for computing PRR is less than 3. We observe that the original count of this rule in 2007Q4 (<xref rid="figure5" ref-type="fig">Figure 5</xref>B) is 3 and its original PRR (<xref rid="figure5" ref-type="fig">Figure 5</xref>E) is 13.39. This means that this rule is a rare event with high strength. Any missing count of this rule causes value <italic>a</italic> to be less than 3 and the PRR will become 0, invalidating this rule. This situation demonstrates the impact of generalization on rare but significant ADR rule, especially for attributes with shallow generalization levels such as Gender, which will hinder or delay the discovery of ADR signals.</p>
          <fig id="figure7" position="float">
            <label>Figure 7</label>
            <caption>
              <p>Variations in signal strength (proportional reporting ratio [PRR]) for different anonymization methods under uniform and frequency-based settings of <italic>θ*</italic>. PPMS: periodical-publishing multisensitive.</p>
            </caption>
            <graphic xlink:href="medinform_v9i10e28752_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Results</title>
        <p>In this paper, we have introduced the periodical publishing scenario usually adopted for publishing SRS data. We have presented 3 kinds of attacks, <italic>BFL</italic>-attacks, which exploit the CaseID of records to link the same cases in the series of releases to crack the anonymization by excluding the nontargets to improve the confidence to hit the record target or the sensitive value.</p>
        <p>To prevent the record and attribute disclosure caused by <italic>BFL</italic>-attacks, we have presented a new model called PPMS(<italic>k</italic>, <italic>θ*</italic>)-bounding. We have also proposed an algorithm called PPMS-Anonymization to anonymize the raw SRS data set achieving the privacy requirement of PPMS(<italic>k</italic>, <italic>θ*</italic>)-bounding. Two enhancements of PPMS-Anonymization, PPMS<sup>+</sup>-Anonymization and PPMS<sup>++</sup>-Anonymization, have also been presented.</p>
        <p>To evaluate the performance of our method, we conducted several experiments with different settings on privacy threshold, from 3 various aspects of evaluation, including information loss, PR, and bias on signal strength. The results showed that our proposed anonymization method, especially PPMS<sup>++</sup>-Anonymization, can effectively prevent <italic>BFL</italic>-attacks caused by follow-up cases across a series of SRS data sets, guarantee the privacy requirement with controlled loss of data utility, and maintain the usability of anonymized SRS data set for ADR detection, especially for frequency-based threshold setting and level-wise setting.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Fostering the development of new detection methods and early discovery of suspected ADR signals is the main driving force for many organizations such as the US FDA to release their SRS data sets to the public. By contrast, evaluating each individual case safety report (ICSR) is necessary for investigating hypothetical signals generated from the SRS data. Unfortunately, due to national privacy regulations such as the Health Insurance Portability and Accountability Act (HIPAA) Privacy Rule [<xref ref-type="bibr" rid="ref30">30</xref>], some specified individual identifiers and narrative were removed from the published FAERS data (following the safe harbor method in Section 164.514 [<xref ref-type="bibr" rid="ref30">30</xref>]). A recent work [<xref ref-type="bibr" rid="ref31">31</xref>] showed that the absence of personal details would significantly affect the assessment of each ICSR. In this context, the published SRS data alone cannot fulfill the purpose of ICSR evaluation. We endeavor to develop an effective privacy protection method for the partially deidentified SRS data (eg, FAERS) without sacrificing the data utility for aggregative disproportionality analysis of suspected ADR signals. How to protect the sharing and access of raw SRS data containing all individually identifiable health information is beyond the scope of this study. Instead, the SRS data organization should provide advanced security schemes, including technical or nontechnical [<xref ref-type="bibr" rid="ref32">32</xref>], to ensure the confidentiality, integrity, and availability of the protected health information for authorized users, as enforced by the HIPAA Security Rule [<xref ref-type="bibr" rid="ref33">33</xref>], which requires a good threat analysis modeling [<xref ref-type="bibr" rid="ref34">34</xref>] before the system design.</p>
      </sec>
      <sec>
        <title>Comparison With Prior Work</title>
        <p>This paper is an extended version of our paper presented at IEEE <italic>ICDE’17</italic> [<xref ref-type="bibr" rid="ref35">35</xref>]. Some new material has been added to clarify the design of the proposed PPMS-Anonymization and its improvement (PPMS<sup>+</sup>-Anonymization), including the design of the function Generalization (<xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>), <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>, and <xref rid="figure1" ref-type="fig">Figure 1</xref>. A significantly more efficient version, PPMS<sup>++</sup>-Anonymization, is proposed. A new way of confidence threshold setting, level-wise setting, was evaluated. Additional ADR signals were inspected. All experiments were reconducted to include the new version (PPMS<sup>++</sup>-Anonymization). Overall, PPMS<sup>++</sup>-Anonymization ensures zero PR on record and attribute linkage, while exhibiting 51%-78% and 59%-82% improvements on information loss over PPMS<sup>+</sup>-Anonymization and PPMS-Anonymization, respectively, and significantly reduces the bias of ADR signal. For example, under the frequency setting, the maximum count bias and PRR bias were reduced from 56 to 3 and 13.4 to 0.1, respectively.</p>
        <p>Based on our work [<xref ref-type="bibr" rid="ref35">35</xref>], Huang et al [<xref ref-type="bibr" rid="ref36">36</xref>] proposed 2 new attacks, <italic>MD</italic>-attack (Medicine Discontinuation attack) and <italic>SS</italic>-attack (Substantial Symptom attack). <italic>MD</italic>-attack assumes the attacker knew when the target stopped his/her treatment, that is, the quarter in which the target’s follow-up record discontinues, while <italic>SS</italic>-attack regards a <italic>QID</italic> group with a substantial amount of adverse reactions risky. Both types of attacks, however, suffer some actuality problems. First, the authors overlooked the phenomenon that an individual’s follow-up records may discontinue for some quarters and reappear in the next quarter. This life span discontinuity of follow-up cases is unpredictable and will thwart the justness of <italic>MD</italic>-attack and the anonymization algorithm. The problem for <italic>SS</italic>-attack is whether knowing someone having many adverse reactions does cause a privacy breach, which needs more convincing evidence. Besides, <italic>SS</italic>-attack is not related to periodical releases of SRS data.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In summary, our PPMS(<italic>k</italic>, <italic>θ*</italic>)-bounding and PPMS-Anonymization can anonymize SRS data sets in the periodical data publishing scenario, preventing the series of releases from the disclosure of sensitive personal information caused by <italic>BFL</italic>-attacks.</p>
        <p>The <italic>BFL</italic>-attacks caused by the existence of CaseID in SRS data are not a particular case in health data. Other types of medical data contain similar features, for example, electronic health records, a digital version of a patient’s paper chart composed of more private information than SRS data. As far as we know, it contains an attribute called patient ID which is similar to CaseID and so may be vulnerable to <italic>BFL</italic>-attacks. We will study this shortly. Some more challenging extensions of this topic include the study of incremental anonymization of data sets published in a cloud environment [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>] and handling a large amount of missing values in SRS data [<xref ref-type="bibr" rid="ref39">39</xref>]. Recently, the emerging differential privacy [<xref ref-type="bibr" rid="ref40">40</xref>-<xref ref-type="bibr" rid="ref42">42</xref>] has been widely recognized as a more rigorous privacy protection method [<xref ref-type="bibr" rid="ref43">43</xref>]. Our recent work [<xref ref-type="bibr" rid="ref44">44</xref>] on integrating differential privacy to anonymize a single release of SRS data has shown promising results. We are currently integrating differential privacy with our PPMS(<italic>k</italic>, <italic>θ</italic>*)-bounding to yield a better protection scheme.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>A summary of privacy models for incremental data publishing.</p>
        <media xlink:href="medinform_v9i10e28752_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 59 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Proof of Theorem 1.</p>
        <media xlink:href="medinform_v9i10e28752_app2.pdf" xlink:title="PDF File  (Adobe PDF File), 98 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Proof of Lemma 2.</p>
        <media xlink:href="medinform_v9i10e28752_app3.pdf" xlink:title="PDF File  (Adobe PDF File), 75 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>PPMS-Anonymization.</p>
        <media xlink:href="medinform_v9i10e28752_app4.pdf" xlink:title="PDF File  (Adobe PDF File), 107 KB"/>
      </supplementary-material>
      <supplementary-material id="app5">
        <label>Multimedia Appendix 5</label>
        <p>Procedure Grouping.</p>
        <media xlink:href="medinform_v9i10e28752_app5.pdf" xlink:title="PDF File  (Adobe PDF File), 101 KB"/>
      </supplementary-material>
      <supplementary-material id="app6">
        <label>Multimedia Appendix 6</label>
        <p>Function Generalization.</p>
        <media xlink:href="medinform_v9i10e28752_app6.pdf" xlink:title="PDF File  (Adobe PDF File), 90 KB"/>
      </supplementary-material>
      <supplementary-material id="app7">
        <label>Multimedia Appendix 7</label>
        <p>Modification of PPMS-Anonymization to realize PPMS+-Anonymization.</p>
        <media xlink:href="medinform_v9i10e28752_app7.pdf" xlink:title="PDF File  (Adobe PDF File), 93 KB"/>
      </supplementary-material>
      <supplementary-material id="app8">
        <label>Multimedia Appendix 8</label>
        <p>The taxonomy tree of Age.</p>
        <media xlink:href="medinform_v9i10e28752_app8.pdf" xlink:title="PDF File  (Adobe PDF File), 82 KB"/>
      </supplementary-material>
      <supplementary-material id="app9">
        <label>Multimedia Appendix 9</label>
        <p>Modification of PPMS-Anonymization to realize PPMS++-Anonymization.</p>
        <media xlink:href="medinform_v9i10e28752_app9.pdf" xlink:title="PDF File  (Adobe PDF File), 83 KB"/>
      </supplementary-material>
      <supplementary-material id="app10">
        <label>Multimedia Appendix 10</label>
        <p>Description of proportional reporting ratio.</p>
        <media xlink:href="medinform_v9i10e28752_app10.pdf" xlink:title="PDF File  (Adobe PDF File), 30 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ADE</term>
          <def>
            <p>adverse drug event</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">ADR</term>
          <def>
            <p>adverse drug reaction</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">DIG</term>
          <def>
            <p>dangerous identity group</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">DIR</term>
          <def>
            <p>dangerous identity ratio</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">DSG</term>
          <def>
            <p>dangerous sensitivity group</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">DSR</term>
          <def>
            <p>dangerous sensitivity ratio</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">FAERS</term>
          <def>
            <p>FDA Adverse Event Reporting System</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">FDA</term>
          <def>
            <p>Food and Drug Administration</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">HIPAA</term>
          <def>
            <p>Health Insurance Portability and Accountability Act</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">MedDRA</term>
          <def>
            <p>Medical Dictionary for Regulatory Activities</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">MHRA</term>
          <def>
            <p>UK Medicines and Healthcare products Regulatory Agency</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">NIL</term>
          <def>
            <p>normalized information loss</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">PPDP</term>
          <def>
            <p>privacy-preserving data publishing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">PPMS</term>
          <def>
            <p>periodical-publishing multisensitive</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">PRR</term>
          <def>
            <p>proportional reporting ratio</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">QID</term>
          <def>
            <p>quasi-identifier</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">SA</term>
          <def>
            <p>sensitive attribute</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb18">SRS</term>
          <def>
            <p>spontaneous reporting system</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by the Ministry of Science and Technology of Taiwan under grant no. MOST103-2221-E-390-022.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <source>FDA Adverse Event Reporting System (FAERS)</source>
          <access-date>2017-04-30</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://open.fda.gov/data/faers/">https://open.fda.gov/data/faers/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="web">
          <source>The Yellow Card Scheme</source>
          <access-date>2015-08-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://yellowcard.mhra.gov.uk">http://yellowcard.mhra.gov.uk</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
          <source>MedEffect Canada</source>
          <access-date>2015-05-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.canada.ca/en/health-canada/services/drugs-health-products/medeffect-canada.html">https://www.canada.ca/en/health-canada/services/drugs-health-products/medeffect-canada.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fung</surname>
              <given-names>BCM</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>PS</given-names>
            </name>
            <name name-style="western">
              <surname>Mehta</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Privacy-preserving data publishing</article-title>
          <source>ACM Comput. Surv</source>
          <year>2010</year>
          <month>06</month>
          <day>01</day>
          <volume>42</volume>
          <issue>4</issue>
          <fpage>1</fpage>
          <lpage>53</lpage>
          <pub-id pub-id-type="doi">10.1145/1749603.1749605</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Dankar</surname>
              <given-names>FK</given-names>
            </name>
            <name name-style="western">
              <surname>Neisa</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jonker</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the risk of patient re-identification from adverse drug event reports</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2013</year>
          <month>10</month>
          <day>05</day>
          <volume>13</volume>
          <fpage>114</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/1472-6947-13-114"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1472-6947-13-114</pub-id>
          <pub-id pub-id-type="medline">24094134</pub-id>
          <pub-id pub-id-type="pii">1472-6947-13-114</pub-id>
          <pub-id pub-id-type="pmcid">PMC4137558</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sweeney</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>k-anonymity: a model for protecting privacy</article-title>
          <source>Int. J. Unc. Fuzz. Knowl. Based Syst</source>
          <year>2012</year>
          <month>05</month>
          <day>02</day>
          <volume>10</volume>
          <issue>05</issue>
          <fpage>557</fpage>
          <lpage>570</lpage>
          <pub-id pub-id-type="doi">10.1142/S0218488502001648</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>WY</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>DC</given-names>
            </name>
          </person-group>
          <article-title>On privacy-preserving publishing of spontaneous ADE reporting data</article-title>
          <source>Proceedings of 2013 IEEE International Conference on Bioinformatics and Biomedicine</source>
          <year>2013</year>
          <month>12</month>
          <day>18</day>
          <conf-name>2013 IEEE International Conference on Bioinformatics and Biomedicine</conf-name>
          <conf-date>December 18-21, 2013</conf-date>
          <conf-loc>Shanghai, China</conf-loc>
          <fpage>51</fpage>
          <lpage>53</lpage>
          <pub-id pub-id-type="doi">10.1109/BIBM.2013.6732760</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Machanavajjhala</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kifer</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Gehrke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Venkitasubramaniam</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>L-diversity: privacy beyond k-anonymity</article-title>
          <source>ACM Trans. Knowl. Discov. Data</source>
          <year>2007</year>
          <month>03</month>
          <day>01</day>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>3</fpage>
          <lpage>es</lpage>
          <pub-id pub-id-type="doi">10.1145/1217299.1217302</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>WY</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>JT</given-names>
            </name>
          </person-group>
          <article-title>Privacy preserving data anonymization of spontaneous ADE reporting system dataset</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2016</year>
          <month>07</month>
          <day>18</day>
          <volume>16 (Suppl 1)</volume>
          <fpage>58</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-016-0293-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-016-0293-4</pub-id>
          <pub-id pub-id-type="medline">27454754</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-016-0293-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC4959360</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>WY</given-names>
            </name>
            <name name-style="western">
              <surname>Lo</surname>
              <given-names>CF</given-names>
            </name>
          </person-group>
          <article-title>Co-training and ensemble based duplicate detection in adverse drug event reporting systems</article-title>
          <source>Proceedings of 2013 IEEE International Conference on Bioinformatics and Biomedicine</source>
          <year>2013</year>
          <month>12</month>
          <day>18</day>
          <conf-name>2013 IEEE International Conference on Bioinformatics and Biomedicine</conf-name>
          <conf-date>December 18-21, 2013</conf-date>
          <conf-loc>Shanghai, China</conf-loc>
          <fpage>7</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.1109/bibm.2013.6732591</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tregunno</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Fink</surname>
              <given-names>DB</given-names>
            </name>
            <name name-style="western">
              <surname>Fernandez-Fernandez</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lázaro-Bengoa</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Norén</surname>
              <given-names>GN</given-names>
            </name>
          </person-group>
          <article-title>Performance of probabilistic method to detect duplicate individual case safety reports</article-title>
          <source>Drug Saf</source>
          <year>2014</year>
          <month>03</month>
          <day>14</day>
          <volume>37</volume>
          <issue>4</issue>
          <fpage>249</fpage>
          <lpage>258</lpage>
          <pub-id pub-id-type="doi">10.1007/s40264-014-0146-y</pub-id>
          <pub-id pub-id-type="medline">24627310</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kreimeyer</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Menschik</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Winiecki</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Paul</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Barash</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Woo</surname>
              <given-names>EJ</given-names>
            </name>
            <name name-style="western">
              <surname>Alimchandani</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Arya</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Zinderman</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Forshee</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Botsis</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Using probabilistic record linkage of structured and unstructured data to identify duplicate cases in spontaneous adverse event reporting systems</article-title>
          <source>Drug Saf</source>
          <year>2017</year>
          <month>03</month>
          <day>14</day>
          <volume>40</volume>
          <issue>7</issue>
          <fpage>571</fpage>
          <lpage>582</lpage>
          <pub-id pub-id-type="doi">10.1007/s40264-017-0523-4</pub-id>
          <pub-id pub-id-type="medline">28293864</pub-id>
          <pub-id pub-id-type="pii">10.1007/s40264-017-0523-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Byun</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Sohn</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Bertino</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Secure anonymization for incremental data sets</article-title>
          <year>2006</year>
          <conf-name>The 3rd VLDB Workshop on Secure Data Management</conf-name>
          <conf-date>September 10-11, 2006</conf-date>
          <conf-loc>Seoul, Korea</conf-loc>
          <fpage>48</fpage>
          <lpage>63</lpage>
          <pub-id pub-id-type="doi">10.1007/11844662_4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Byun</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Bertino</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Sohn</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Privacy-preserving incremental data dissemination</article-title>
          <source>JCS</source>
          <year>2009</year>
          <month>03</month>
          <day>16</day>
          <volume>17</volume>
          <issue>1</issue>
          <fpage>43</fpage>
          <lpage>68</lpage>
          <pub-id pub-id-type="doi">10.3233/jcs-2009-0316</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Maintaining k-anonymity against incremental updates</article-title>
          <source>Proceedings of the 19th International Conference on Scientific and Statistical Database Management</source>
          <year>2007</year>
          <month>07</month>
          <day>09</day>
          <conf-name>The 19th International Conference on Scientific and Statistical Database Management</conf-name>
          <conf-date>July 9-11, 2007</conf-date>
          <conf-loc>Banff, Canada</conf-loc>
          <fpage>5</fpage>
          <lpage>14</lpage>
          <pub-id pub-id-type="doi">10.1109/ssdbm.2007.16</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fung</surname>
              <given-names>BCM</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>AWC</given-names>
            </name>
            <name name-style="western">
              <surname>Pei</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Anonymity for continuous data publishing</article-title>
          <source>Proceedings of the 11th International Conference on Extending Database Technology</source>
          <year>2008</year>
          <month>03</month>
          <day>25</day>
          <conf-name>The 11th International Conference on Extending Database Technology</conf-name>
          <conf-date>March 25-29, 2008</conf-date>
          <conf-loc>Nantes, France</conf-loc>
          <fpage>264</fpage>
          <lpage>275</lpage>
          <pub-id pub-id-type="doi">10.1145/1353343.1353378</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xiao</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>M-invariance: towards privacy preserving re-publication of dynamic data sets</article-title>
          <source>Proceedings of 2007 ACM SIGMOD International Conference on Management of Data</source>
          <year>2007</year>
          <month>06</month>
          <day>11</day>
          <conf-name>The 2007 ACM SIGMOD International Conference on Management of Data</conf-name>
          <conf-date>June 11-14, 2007</conf-date>
          <conf-loc>Beijing, China</conf-loc>
          <fpage>689</fpage>
          <lpage>700</lpage>
          <pub-id pub-id-type="doi">10.1145/1247480.1247556</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>AWC</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>RCW</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Privacy preserving serial data publishing by role composition</article-title>
          <source>Proc. VLDB Endow</source>
          <year>2008</year>
          <month>08</month>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>845</fpage>
          <lpage>856</lpage>
          <pub-id pub-id-type="doi">10.14778/1453856.1453948</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Challenging more updates: towards anonymous re-publication of fully dynamic data sets</article-title>
          <source>arXiv</source>
          <year>2008</year>
          <month>06</month>
          <day>28</day>
          <access-date>2021-05-22</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/0806.4703">https://arxiv.org/abs/0806.4703</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Anjum</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Raschia</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gelgon</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Khan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Malik</surname>
              <given-names>SUR</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmad</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmed</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Suhail</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Alam</surname>
              <given-names>MM</given-names>
            </name>
          </person-group>
          <article-title>τ-safety: A privacy model for sequential publication with arbitrary updates</article-title>
          <source>Computers &#38; Security</source>
          <year>2017</year>
          <month>05</month>
          <volume>66</volume>
          <fpage>20</fpage>
          <lpage>39</lpage>
          <pub-id pub-id-type="doi">10.1016/j.cose.2016.12.014</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>He</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Barman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Naughton</surname>
              <given-names>JF</given-names>
            </name>
          </person-group>
          <article-title>Preventing equivalence attacks in updated, anonymized data</article-title>
          <source>Proceedings of the 27th IEEE International Conference on Data Engineering</source>
          <year>2011</year>
          <month>04</month>
          <conf-name>The 27th IEEE International Conference on Data Engineering</conf-name>
          <conf-date>April 11-16, 2011</conf-date>
          <conf-loc>Hannover, Germany</conf-loc>
          <fpage>529</fpage>
          <lpage>540</lpage>
          <pub-id pub-id-type="doi">10.1109/icde.2011.5767924</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bewong</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Privacy preserving serial publication of transactional data</article-title>
          <source>Information Systems</source>
          <year>2019</year>
          <month>05</month>
          <volume>82</volume>
          <fpage>53</fpage>
          <lpage>70</lpage>
          <pub-id pub-id-type="doi">10.1016/j.is.2019.01.001</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Fung</surname>
              <given-names>BCM</given-names>
            </name>
          </person-group>
          <article-title>Anonymizing sequential release</article-title>
          <source>Proceedings of the 12th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</source>
          <year>2006</year>
          <month>08</month>
          <conf-name>The 12th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name>
          <conf-date>August 20-23, 2006</conf-date>
          <conf-loc>Philadelphia, PA</conf-loc>
          <fpage>414</fpage>
          <lpage>423</lpage>
          <pub-id pub-id-type="doi">10.1145/1150402.1150449</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shmueli</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Tassa</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wasserstein</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Shapira</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Rokach</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Limiting disclosure of sensitive data in sequential releases of databases</article-title>
          <source>Information Sciences</source>
          <year>2012</year>
          <month>05</month>
          <volume>191</volume>
          <fpage>98</fpage>
          <lpage>127</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ins.2011.12.020</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shmueli</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Tassa</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Privacy by diversity in sequential releases of databases</article-title>
          <source>Information Sciences</source>
          <year>2015</year>
          <month>03</month>
          <volume>298</volume>
          <fpage>344</fpage>
          <lpage>372</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ins.2014.11.005</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Byun</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Kamra</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bertino</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Efficient k-anonymization using clustering techniques</article-title>
          <source>Proceedings of the 12th International Conference on Database Systems for Advanced Applications</source>
          <year>2007</year>
          <month>04</month>
          <conf-name>The 12th International Conference on Database Systems for Advanced Applications</conf-name>
          <conf-date>April 9-12, 2007</conf-date>
          <conf-loc>Bangkok, Thailand</conf-loc>
          <fpage>188</fpage>
          <lpage>200</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-540-71703-4_18</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="web">
          <source>Medical Subject Headings (MeSH)</source>
          <access-date>2017-03-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.ncbi.nlm.nih.gov/mesh/">http://www.ncbi.nlm.nih.gov/mesh/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="web">
          <source>MedWatch</source>
          <access-date>2015-08-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.fda.gov/Safety/MedWatch/">http://www.fda.gov/Safety/MedWatch/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Evans</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Waller</surname>
              <given-names>PC</given-names>
            </name>
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Use of proportional reporting ratios (PRRs) for signal generation from spontaneous adverse drug reaction reports</article-title>
          <source>Pharmacoepidemiol Drug Saf</source>
          <year>2001</year>
          <month>12</month>
          <day>10</day>
          <volume>10</volume>
          <issue>6</issue>
          <fpage>483</fpage>
          <lpage>486</lpage>
          <pub-id pub-id-type="doi">10.1002/pds.677</pub-id>
          <pub-id pub-id-type="medline">11828828</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="web">
          <article-title>Guidance Regarding Methods for De-identification of Protected Health Information in Accordance with the Health Insurance Portability and Accountability Act (HIPAA) Privacy Rule</article-title>
          <source>Office for Civil Rights</source>
          <year>2012</year>
          <month>11</month>
          <access-date>2021-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.hhs.gov/sites/default/files/ocr/privacy/hipaa/understanding/coveredentities/De-identification/hhs_deid_guidance.pdf">https://www.hhs.gov/sites/default/files/ocr/privacy/hipaa/understanding/coveredentities/De-identification/hhs_deid_guidance.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Marwitz</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>SC</given-names>
            </name>
            <name name-style="western">
              <surname>Kortepeter</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Dal Pan</surname>
              <given-names>GJ</given-names>
            </name>
            <name name-style="western">
              <surname>Muñoz</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>An evaluation of postmarketing reports with an outcome of death in the US FDA Adverse Event Reporting System</article-title>
          <source>Drug Saf</source>
          <year>2020</year>
          <month>05</month>
          <volume>43</volume>
          <issue>5</issue>
          <fpage>457</fpage>
          <lpage>465</lpage>
          <pub-id pub-id-type="doi">10.1007/s40264-020-00908-5</pub-id>
          <pub-id pub-id-type="medline">31981082</pub-id>
          <pub-id pub-id-type="pii">10.1007/s40264-020-00908-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Scheibner</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Raisaro</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Troncoso-Pastoriza</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Ienca</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fellay</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Vayena</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Hubaux</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Revolutionizing medical data sharing using advanced privacy-enhancing technologies: technical, legal, and ethical synthesis</article-title>
          <source>J Med Internet Res</source>
          <year>2021</year>
          <month>02</month>
          <day>25</day>
          <volume>23</volume>
          <issue>2</issue>
          <fpage>e25120</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2021/2/e25120/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/25120</pub-id>
          <pub-id pub-id-type="medline">33629963</pub-id>
          <pub-id pub-id-type="pii">v23i2e25120</pub-id>
          <pub-id pub-id-type="pmcid">PMC7952236</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="web">
          <article-title>Summary of the HIPAA Security Rule</article-title>
          <source>Office for Civil Rights</source>
          <year>2013</year>
          <month>07</month>
          <access-date>2021-06-17</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.hhs.gov/hipaa/for-professionals/security/laws-regulations/index.html">https://www.hhs.gov/hipaa/for-professionals/security/laws-regulations/index.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shevchenko</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chick</surname>
              <given-names>TA</given-names>
            </name>
            <name name-style="western">
              <surname>O’Riordan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Scanlon</surname>
              <given-names>TP</given-names>
            </name>
            <name name-style="western">
              <surname>Woody</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <source>Threat Modeling: A Summary of Available Methods</source>
          <access-date>2021-07-21</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://resources.sei.cmu.edu/asset_files/WhitePaper/2018_019_001_524597.pdf">https://resources.sei.cmu.edu/asset_files/WhitePaper/2018_019_001_524597.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>JT</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>WY</given-names>
            </name>
          </person-group>
          <article-title>Privacy preserving anonymity for periodical SRS data publishing</article-title>
          <source>Proceedings of the 33rd IEEE International Conference on Data Engineering</source>
          <year>2017</year>
          <month>04</month>
          <conf-name>The 33rd IEEE International Conference on Data Engineering</conf-name>
          <conf-date>April 19-22, 2017</conf-date>
          <conf-loc>San Diego, CA</conf-loc>
          <fpage>1344</fpage>
          <lpage>1355</lpage>
          <pub-id pub-id-type="doi">10.1109/icde.2017.176</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Yi</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Shang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Improved privacy preserving method for periodical SRS publishing</article-title>
          <source>PLoS One</source>
          <year>2021</year>
          <month>04</month>
          <day>22</day>
          <volume>16</volume>
          <issue>4</issue>
          <fpage>e0250457</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0250457"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0250457</pub-id>
          <pub-id pub-id-type="medline">33886662</pub-id>
          <pub-id pub-id-type="pii">PONE-D-20-35149</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Aldeen</surname>
              <given-names>YAAS</given-names>
            </name>
            <name name-style="western">
              <surname>Salleh</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aljeroudi</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>An innovative privacy preserving technique for incremental datasets on cloud computing</article-title>
          <source>J Biomed Inform</source>
          <year>2016</year>
          <month>08</month>
          <volume>62</volume>
          <fpage>107</fpage>
          <lpage>116</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(16)30054-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2016.06.011</pub-id>
          <pub-id pub-id-type="medline">27369566</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(16)30054-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jeon</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Seo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Sohn</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Moon</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Joo</surname>
              <given-names>HJ</given-names>
            </name>
          </person-group>
          <article-title>Proposal and assessment of a de-identification strategy to enhance anonymity of the observational medical outcomes partnership common data model (OMOP-CDM) in a public cloud-computing environment: anonymization of medical data using privacy models</article-title>
          <source>J Med Internet Res</source>
          <year>2020</year>
          <month>11</month>
          <day>26</day>
          <volume>22</volume>
          <issue>11</issue>
          <fpage>e19597</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2020/11/e19597/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/19597</pub-id>
          <pub-id pub-id-type="medline">33177037</pub-id>
          <pub-id pub-id-type="pii">v22i11e19597</pub-id>
          <pub-id pub-id-type="pmcid">PMC7728527</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hsiao</surname>
              <given-names>MH</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>WY</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>KY</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>ZX</given-names>
            </name>
          </person-group>
          <article-title>On anonymizing medical microdata with large-scale missing values - A case study with the FAERS dataset</article-title>
          <source>Proceedings of the 41st Annual International Conference of the IEEE Engineering in Medicine &#38; Biology Society</source>
          <year>2019</year>
          <month>07</month>
          <conf-name>The 41st Annual International Conference of the IEEE Engineering in Medicine &#38; Biology Society</conf-name>
          <conf-date>July 23-27, 2019</conf-date>
          <conf-loc>Berlin, Germany</conf-loc>
          <fpage>6505</fpage>
          <lpage>6508</lpage>
          <pub-id pub-id-type="doi">10.1109/EMBC.2019.8857025</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dwork</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Differential privacy</article-title>
          <source>Proceedings of the 33rd International Conference on Automata, Languages and Programming</source>
          <year>2006</year>
          <month>07</month>
          <conf-name>The 33rd International Conference on Automata, Languages and Programming</conf-name>
          <conf-date>July 10-14, 2006</conf-date>
          <conf-loc>Venice, Italy</conf-loc>
          <fpage>1</fpage>
          <lpage>12</lpage>
          <pub-id pub-id-type="doi">10.1007/11787006_1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Generalized Gaussian mechanism for differential privacy</article-title>
          <source>IEEE Trans Knowl Data Eng</source>
          <year>2019</year>
          <month>04</month>
          <day>1</day>
          <volume>31</volume>
          <issue>4</issue>
          <fpage>747</fpage>
          <lpage>756</lpage>
          <pub-id pub-id-type="doi">10.1109/tkde.2018.2845388</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Impact of inaccurate data on differential privacy</article-title>
          <source>Computers &#38; Security</source>
          <year>2019</year>
          <month>05</month>
          <volume>82</volume>
          <fpage>68</fpage>
          <lpage>79</lpage>
          <pub-id pub-id-type="doi">10.1016/j.cose.2018.12.007</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Desfontaines</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Pejó</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>SoK: Differential privacies</article-title>
          <source>Proceedings on Privacy Enhancing Technologies</source>
          <year>2020</year>
          <month>05</month>
          <volume>2020</volume>
          <issue>2</issue>
          <fpage>288</fpage>
          <lpage>313</lpage>
          <pub-id pub-id-type="doi">10.2478/popets-2020-0028</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>WY</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>ZX</given-names>
            </name>
          </person-group>
          <article-title>Embracing differential privacy for anonymizing spontaneous ADE reporting data</article-title>
          <source>Proceedings of the 2020 IEEE International Conference on Bioinformatics and Biomedicine</source>
          <year>2020</year>
          <month>12</month>
          <conf-name>The 2020 IEEE International Conference on Bioinformatics and Biomedicine</conf-name>
          <conf-date>December 16-19, 2020</conf-date>
          <conf-loc>Seoul, Korea</conf-loc>
          <fpage>2015</fpage>
          <lpage>2022</lpage>
          <pub-id pub-id-type="doi">10.1109/bibm49941.2020.9313578</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
