Merge pull request #1 from IFCA-Advanced-Computing/develop

judithspd · web-flow · commit ad21e804332f · 2024-05-13T11:26:14.000+02:00
Update to version 0.0.3
diff --git a/CITATION.cff b/CITATION.cff
@@ -0,0 +1,13 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+- family-names: "Sáinz-Pardo Díaz"
+  given-names: "Judith"
+  orcid: "https://orcid.org/0000-0002-8387-578X"
+- family-names: "López García"
+  given-names: "Álvaro"
+  orcid: "https://orcid.org/0000-0002-0013-4602"
+title: "ANJANA"
+version: 0.0.3
+date-released: 2024-04-18
+url: "https://github.com/IFCA-Advanced-Computing/anjana"
diff --git a/MANIFEST.in b/MANIFEST.in
diff --git a/README.md b/README.md
@@ -1,7 +1,9 @@
 # ANJANA
 [![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-green.svg)](https://gitlab.ifca.es/privacy-security/anjana/-/blob/main/LICENSE)
 [![codecov](https://codecov.io/gh/IFCA-Advanced-Computing/anjana/graph/badge.svg?token=AVI53GZ7YD)](https://codecov.io/gh/IFCA-Advanced-Computing/anjana)
-
+![PyPI](https://img.shields.io/pypi/v/anjana)
+![PyPI - Downloads](https://img.shields.io/pypi/dm/anjana)
+[![Documentation Status](https://readthedocs.org/projects/anjana/badge/?version=latest)](https://anjana.readthedocs.io/en/latest/?badge=latest)
 ![Python version](https://img.shields.io/badge/python-3.9|3.10|3.11|3.12-blue)
 
 
@@ -20,7 +22,7 @@ The following anonymity techniques are implemented, based on the Python library
 * _Enhanced β-likeness_.
 * _δ-disclosure privacy_.
 
-## :bulb: Installation
+## Installation
 First, we strongly recommend the use of a virtual environment. On Linux:
 ```bash
 virtualenv .venv -p python3
@@ -42,15 +44,15 @@ Install the most updated version of anjana (linux and windows):
 pip install git+https://github.com/IFCA-Advanced-Computing/anjana.git
 ```
 
-## :rocket: Getting started
+## Getting started
 
 For anonymizing your data you need to introduce:
-* The **pandas dataframe** with the data to be anonymized. Each column can contain: indentifiers, quasi-indentifiers or sensitive attributes.
+* The **pandas dataframe** with the data to be anonymized. Each column can contain: identifiers, quasi-identifiers or sensitive attributes.
 * The **list with the names of the identifiers** in the dataframe, in order to suppress them.
 * The **list with the names of the quasi-identifiers** in the dataframe.
 * The **sensitive attribute** (only one) in case of applying other techniques than _k-anonymity_.
 * The **level of anonymity to be applied**, e.g. _k_ (for _k-anonymity_), _ℓ_ (for _ℓ-diversity_), _t_ (for _t-closeness_), _β_ (for _basic or enhanced β-likeness_), etc.
-* Maximum **level of record suppression** allowed (from 0 to 100).
+* Maximum **level of record suppression** allowed (from 0 to 100, acting as the percentage of suppressed records).
 * Dictionary containing one dictionary for each quasi-identifier with the **hierarchies** and the levels.
 
 ### Example: apply _k-anonymity_, _ℓ-diversity_ and _t-closeness_ to the [adult dataset](https://archive.ics.uci.edu/dataset/2/adult) with some predefined hierarchies:
@@ -137,6 +139,8 @@ For a better understanding, let's look at the following example. Supose that we
 Then, in order to create the hierarchies we can define the following dictionary:
 
 ```python
+import numpy as np
+
 age = data['age'].values
 # Values: [29 24 28 27 24 23 19 29 17 19] (note that the following can be automatized)
 age_5years = ['[25, 30)', '[20, 25)', '[25, 30)',
@@ -160,10 +164,34 @@ hierarchies = {
 }
 ```
 
-## :scroll: License
-This project is licensed under the [Apache 2.0 license](https://gitlab.ifca.es/privacy-security/anjana/-/blob/main/LICENSE?ref_type=heads).
+You can also use the function _generate_intervals()_ from _utils_ for creating the interval-based hierarchy as follows:
+
+```python
+import numpy as np
+from anjana.anonymity import utils
+
+age = data['age'].values
+
+hierarchies = {
+    "age": {
+        0: data["age"].values,
+        1: utils.generate_intervals(data["age"].values, 0, 100, 5),
+        2: utils.generate_intervals(data["age"].values, 0, 100, 10),
+    },
+    "gender": {
+        0: data["gender"].values,
+        1: np.array(["*"] * len(data["gender"].values)) # Suppression
+    },
+    "city": {0: data["city"].values,
+             1: np.array(["*"] * len(data["city"].values))} # Suppression
+}
+```
+
+
+## License
+This project is licensed under the [Apache 2.0 license](https://github.com/IFCA-Advanced-Computing/anjana/blob/main/LICENSE).
 
-## :warning: Project status
+## Project status
 This project is under active development.
 
 ## Funding and acknowledgments
diff --git a/anjana/__init__.py b/anjana/__init__.py
@@ -16,4 +16,4 @@
 
 """ANJANA is an open source framework for anonymizing data with different techniques."""
 
-__version__ = "0.0.2"
+__version__ = "0.0.3"
diff --git a/anjana/anonymity/_delta_disclosure.py b/anjana/anonymity/_delta_disclosure.py
@@ -79,7 +79,7 @@ def delta_disclosure(
     quasi_ident_gen = copy(quasi_ident)
 
     if delta_real <= delta:
-        print(f"The data verifies delta-disclosure with t={delta_real}")
+        print(f"The data verifies delta-disclosure with delta={delta_real}")
         return data_kanon
 
     while delta_real > delta:
diff --git a/anjana/anonymity/_k_anonymity.py b/anjana/anonymity/_k_anonymity.py
@@ -153,6 +153,7 @@ def alpha_k_anonymity(
             data_kanon, quasi_ident
         )
 
+        k_ec = []
         alpha_ec = []
         for ec in equiv_class:
             data_temp = data_kanon.iloc[
@@ -164,14 +165,17 @@ def alpha_k_anonymity(
                 for s in values
             ]
             alpha_ec.append(max(alpha_s))
+            k_ec.append(len(ec))
 
         if alpha > min(alpha_ec):
             if max(alpha_ec) <= alpha:
                 return data_kanon
 
-            data_ec = pd.DataFrame({"equiv_class": equiv_class, "alpha": alpha_ec})
+            data_ec = pd.DataFrame(
+                {"equiv_class": equiv_class, "alpha": alpha_ec, "k": k_ec}
+            )
             data_ec_alpha = data_ec[data_ec.alpha > alpha]
-            records_sup = sum(data_ec_alpha.alpha.values)
+            records_sup = sum(data_ec_alpha.k.values)
             if (records_sup + supp_records) * 100 / len(data) <= supp_level:
                 ec_elim = np.concatenate(
                     [
diff --git a/anjana/anonymity/_l_diversity.py b/anjana/anonymity/_l_diversity.py
@@ -254,6 +254,7 @@ def recursive_c_l_diversity(
         equiv_class = pycanon.anonymity.utils.aux_anonymity.get_equiv_class(
             data_kanon, quasi_ident
         )
+        k_ec = []
         c_ec = []
         for ec in equiv_class:
             data_temp = data_kanon.iloc[
@@ -262,12 +263,15 @@ def recursive_c_l_diversity(
             values = np.unique(data_temp[sens_att].values)
             r_ec = np.sort([len(data_temp[data_temp[sens_att] == s]) for s in values])
             c_ec.append(np.floor(r_ec[0] / sum(r_ec[(l_div - 1) :]) + 1))
+            k_ec.append(len(ec))
             if max(c_ec) < c:
                 f"Recursive (c,l)-diversity cannot be achieved for l={l_div} and c={c}"
             else:
-                data_ec = pd.DataFrame({"equiv_class": equiv_class, "c_ec": c_ec})
+                data_ec = pd.DataFrame(
+                    {"equiv_class": equiv_class, "c_ec": c_ec, "k": k_ec}
+                )
                 data_ec_c = data_ec[data_ec.c_ec < c]
-                records_sup = sum(data_ec_c.c_ec.values)
+                records_sup = sum(data_ec_c.k.values)
                 if (records_sup + supp_records) * 100 / len(data) <= supp_level:
                     ec_elim = np.concatenate(
                         [
@@ -358,11 +362,14 @@ def _l_diversity_inner(
         ec_sensitivity = [
             len(np.unique(data_kanon.iloc[ec][sens_att])) for ec in equiv_class
         ]
+        k_ec = [len(ec) for ec in equiv_class]
 
         if l_div > max(ec_sensitivity):
-            data_ec = pd.DataFrame({"equiv_class": equiv_class, "l": ec_sensitivity})
+            data_ec = pd.DataFrame(
+                {"equiv_class": equiv_class, "l": ec_sensitivity, "k": k_ec}
+            )
             data_ec_l = data_ec[data_ec.l < l_div]
-            records_sup = sum(data_ec_l.l.values)
+            records_sup = sum(data_ec_l.k.values)
             if (records_sup + supp_records_k) * 100 / len(data) <= supp_level:
                 ec_elim = np.concatenate(
                     [
diff --git a/anjana/anonymity/utils/__init__.py b/anjana/anonymity/utils/__init__.py
@@ -20,11 +20,13 @@
     apply_hierarchy,
     check_gen_level,
     get_transformation,
+    generate_intervals,
 )
 
 __all__ = [
     "suppress_identifiers",
     "apply_hierarchy",
     "check_gen_level",
     "get_transformation",
+    "generate_intervals",
 ]
diff --git a/anjana/anonymity/utils/utils.py b/anjana/anonymity/utils/utils.py
@@ -149,3 +149,41 @@ def get_transformation(
             transformation.append(0)
 
     return transformation
+
+
+@beartype()
+def generate_intervals(
+    quasi_ident: typing.Union[typing.List, np.ndarray],
+    inf: typing.Union[int, float],
+    sup: typing.Union[int, float],
+    step: int,
+) -> list:
+    """Create an interval-based generalization (hierarchy) of a numeric
+    quasi-identifier.
+
+    Each value is mapped to the half-open interval ``[a, b)`` that contains it,
+    where the breakpoints are ``inf, inf + step, ...`` (as produced by
+    ``np.arange(inf, sup + 1, step)``). A value lying exactly on a breakpoint
+    belongs to the interval that starts at that breakpoint, e.g. with a step of
+    5 the value 25 is mapped to ``[25, 30)``.
+
+    :param quasi_ident: values of the quasi-identifier on which the interval-based
+        generalization is to be obtained
+    :type quasi_ident: list or numpy array
+
+    :param inf: lower value of the set of intervals; values below it are
+        assigned to the first interval
+    :type inf: int or float
+
+    :param sup: bigger value of the set of intervals
+    :type sup: int or float
+
+    :param step: spacing between values of the intervals
+    :type step: int
+
+    :return: list with the intervals associated with the given values
+    :rtype: list
+
+    :raises IndexError: if a value lies beyond the last interval
+    """
+    values = np.arange(inf, sup + 1, step)
+    interval = []
+    for num in quasi_ident:
+        # side="right" makes a value equal to a breakpoint fall in the interval
+        # that starts at it, which is what the "[a, b)" notation states.
+        lower = int(np.searchsorted(values, num, side="right")) - 1
+        # Values below the first breakpoint are clamped to the first interval.
+        lower = max(lower, 0)
+        start = values[lower]
+        # The last breakpoint has no successor in ``values``: close its interval
+        # one step further so that every value up to ``sup`` is still covered.
+        end = values[lower + 1] if lower + 1 < len(values) else start + step
+        # ``not num < end`` (rather than ``num >= end``) also rejects NaN.
+        # IndexError is kept as the exception type for backward compatibility.
+        if not num < end:
+            raise IndexError(
+                f"Value {num} lies beyond the last interval [{start}, {end})"
+            )
+        interval.append(f"[{start}, {end})")
+
+    return interval
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -19,7 +19,7 @@
 project = "ANJANA"
 copyright = "2024, Spanish National Research Council (CSIC)"
 author = "Judith Sáinz-Pardo Díaz (CSIC)"
-release = "0.0.2"
+release = "0.0.3"
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
@@ -1,7 +1,7 @@
 Getting started
 ###############
 
-Example with the `adult dataset`_, anonymizing using three techniques: k-anonymity, :math:`\ell`-diversity and t-closeness (the data and hierarquies can be found in the `examples folder of the repository`_):
+Example with the `adult dataset`_, anonymizing using three techniques: k-anonymity, :math:`\ell`-diversity and t-closeness (the data and hierarchies can be found in the `examples folder of the repository`_):
 
 .. code-block:: python
 
@@ -98,7 +98,7 @@ For a better understanding, let's look at the following example. Supose that we
 | John      | 19  | Male   | Kerala     | Viral infection |
 +-----------+-----+--------+------------+-----------------+
 
-Then, in order to create the hierarquies we can define the following dictionary:
+Then, in order to create the hierarchies we can define the following dictionary:
 
 .. code-block:: python
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -2,7 +2,7 @@ ANJANA
 =============================================================================
 
 ANJANA is a `Python`_ library which allows the application of different anonymity
-techiniques based on a set of identifiers, quasi-identifiers (QI) and a sensitive 
+techniques based on a set of identifiers, quasi-identifiers (QI) and a sensitive 
 attribute. It's easy to use and fast. 
 The following anonymity techniques can be applied:
 
diff --git a/examples/adult.py b/examples/adult.py
@@ -63,3 +63,11 @@
 # Value of k calculated: 10
 
 data_anon.to_csv("adult_k10.csv")
+
+print(f"Number of records suppressed: {len(data) - len(data_anon)}")
+print(
+    f"Percentage of records suppressed: {100 * (len(data) - len(data_anon)) / len(data)} %"
+)
+
+# Number of records suppressed: 14234
+# Percentage of records suppressed: 43.71487362181751 %
diff --git a/examples/adult_alpha_k_anonymity.py b/examples/adult_alpha_k_anonymity.py
@@ -68,3 +68,10 @@
 # Elapsed time: 1.1014823913574219
 # Value of k calculated: 10
 # Value of alpha calculated: 0.8
+
+print(f"Number of records suppressed: {len(data) - len(data_anon)}")
+print(
+    f"Percentage of records suppressed: {100 * (len(data) - len(data_anon)) / len(data)} %"
+)
+# Number of records suppressed: 14234
+# Percentage of records suppressed: 43.71487362181751 %
diff --git a/examples/adult_basic_beta_likeness.py b/examples/adult_basic_beta_likeness.py
@@ -44,7 +44,7 @@
 sens_att = "salary-class"
 k = 10
 beta = 0.5
-supp_level = 50
+supp_level = 100
 
 hierarchies = {
     "age": dict(pd.read_csv("hierarchies/age.csv", header=None)),
@@ -68,5 +68,13 @@
 )
 
 # Elapsed time: 1.1014823913574219
-# Value of k calculated: 4950
-# Value of beta (basic) calculated: 0.28299597682058103
+# Value of k calculated: 2098
+# Value of beta (basic) calculated: 0.41781323480116844
+
+print(f"Number of records suppressed: {len(data) - len(data_anon)}")
+print(
+    f"Percentage of records suppressed: {100 * (len(data) - len(data_anon)) / len(data)} %"
+)
+
+# Number of records suppressed: 23686
+# Percentage of records suppressed: 72.74346610976322 %
diff --git a/examples/adult_delta_disclosure.py b/examples/adult_delta_disclosure.py
@@ -46,6 +46,20 @@
 delta = 3
 supp_level = 50
 
+all_cols = [
+    "age",
+    "education",
+    "marital-status",
+    "occupation",
+    "sex",
+    "native-country",
+    "race",
+    "salary-class",
+]
+sample = data.sample(n=15)
+sample = sample.loc[:, all_cols]
+sample.to_csv("test.csv")
+
 hierarchies = {
     "age": dict(pd.read_csv("hierarchies/age.csv", header=None)),
     "education": dict(pd.read_csv("hierarchies/education.csv", header=None)),
@@ -67,6 +81,14 @@
     f"{pycanon.anonymity.delta_disclosure(data_anon, quasi_ident, [sens_att])}"
 )
 
-# Elapsed time: 1.1014823913574219
+# Elapsed time: 4.623609304428101
 # Value of k calculated: 392
 # Value of delta calculated: 2.159243878369523
+
+print(f"Number of records suppressed: {len(data) - len(data_anon)}")
+print(
+    f"Percentage of records suppressed: {100 * (len(data) - len(data_anon)) / len(data)} %"
+)
+
+# Number of records suppressed: 14234
+# Percentage of records suppressed: 43.71487362181751 %
diff --git a/examples/adult_enhanced_beta_likeness.py b/examples/adult_enhanced_beta_likeness.py
@@ -44,7 +44,7 @@
 sens_att = "salary-class"
 k = 10
 beta = 0.5
-supp_level = 50
+supp_level = 100
 
 hierarchies = {
     "age": dict(pd.read_csv("hierarchies/age.csv", header=None)),
@@ -67,6 +67,14 @@
     f"{pycanon.anonymity.enhanced_beta_likeness(data_anon, quasi_ident, [sens_att])}"
 )
 
-# Elapsed time: 1.1014823913574219
-# Value of k calculated: 4950
-# Value of beta (enhanced) calculated: 0.28299597682058103
+# Elapsed time: 2.7565865516662598
+# Value of k calculated: 2098
+# Value of beta (enhanced) calculated: 0.41781323480116844
+
+print(f"Number of records suppressed: {len(data) - len(data_anon)}")
+print(
+    f"Percentage of records suppressed: {100 * (len(data) - len(data_anon)) / len(data)} %"
+)
+
+# Number of records suppressed: 23686
+# Percentage of records suppressed: 72.74346610976322 %
diff --git a/examples/adult_ldiversity.py b/examples/adult_ldiversity.py
diff --git a/examples/adult_tcloseness.py b/examples/adult_tcloseness.py
diff --git a/examples/hospital.py b/examples/hospital.py
diff --git a/pyproject.toml b/pyproject.toml
diff --git a/tests/test_anonymity.py b/tests/test_anonymity.py
diff --git a/tests/test_unitary.py b/tests/test_unitary.py

Original file line number	Diff line number	Diff line change
`@@ -16,4 +16,4 @@`
`16`	`16`
`17`	`17`	`"""ANJANA is an open source framework for anonymizing data with different techniques."""`
`18`	`18`
`19`		`-__version__ = "0.0.2"`
	`19`	`+__version__ = "0.0.3"`
Original file line number	Diff line number	Diff line change
`@@ -20,11 +20,13 @@`
`20`	`20`	`apply_hierarchy,`
`21`	`21`	`check_gen_level,`
`22`	`22`	`get_transformation,`
	`23`	`+ generate_intervals,`
`23`	`24`	`)`
`24`	`25`
`25`	`26`	`__all__ = [`
`26`	`27`	`"suppress_identifiers",`
`27`	`28`	`"apply_hierarchy",`
`28`	`29`	`"check_gen_level",`
`29`	`30`	`"get_transformation",`
	`31`	`+ "generate_intervals",`
`30`	`32`	`]`