Source code for guardian_ai.privacy_estimation.merlin_attack

#!/usr/bin/env python
# -*- coding: utf-8 -*--

# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import numpy as np
import scipy.sparse as sp
from sklearn.base import BaseEstimator

from guardian_ai.privacy_estimation.attack import AttackType, BlackBoxAttack
from guardian_ai.privacy_estimation.model import TargetModel
from guardian_ai.privacy_estimation.utils import log_loss_vector



[docs]
class MerlinAttack(BlackBoxAttack):
    """
    Implements the Merlin Attack as described in the paper: Revisiting Membership Inference
    Under Realistic Assumptions by Jayaraman et al.
    The main idea is to perturb a data point with random noise, and calculate the loss on
    the noisy points in this neighborhood. If the loss of a large fraction of these points is
    above the loss of the target point, it might imply that the target point is in a local
    minimum, and therefore the model might have fitted around it, implying it might have seen
    it at training time.
    """

    def __init__(
        self,
        attack_model: BaseEstimator,
        noise_type: str = "gaussian",
        noise_coverage: str = "full",
        noise_magnitude: float = 0.01,
        max_t: int = 50,
    ):
        """
        Store the noise configuration and initialize the underlying black-box attack.

        The defaults are mostly taken from the original implementation of this attack.

        Parameters
        ----------
        attack_model: sklearn.base.BaseEstimator
            The type of attack model to be used.
            Typically, it's ThresholdClassifier.
        noise_type: str
            Distribution the noise is drawn from. ``"uniform"`` selects uniform
            noise; any other value selects gaussian noise.
        noise_coverage: str
            ``"full"`` adds noise to all attributes; any other value adds noise
            to a single, randomly chosen attribute.
        noise_magnitude: float
            Size of the noise: the upper bound of the uniform range, or the
            standard deviation of the gaussian.
        max_t: int
            The number of noisy points to generate to calculate the Merlin Ratio.

        """
        # The noise settings are assigned before the base-class initializer is invoked.
        self.noise_type, self.noise_coverage = noise_type, noise_coverage
        self.noise_magnitude, self.max_t = noise_magnitude, max_t
        super().__init__(attack_model, name=AttackType.MerlinAttack.name)


[docs]
    def generate_noise(self, shape: np.shape, dtype):
        """
        Generate noise to be added to the target data point.

        Parameters
        ----------
        shape: : np.shape
            Shape of the target data point
        dtype: np.dtype
            Datatype of the target data point

        Returns
        -------
        {array-like}
            Noise generated according to the parameters to match the shape of the target.

        """
        noise = np.zeros(shape, dtype=dtype)
        if self.noise_coverage == "full":
            if self.noise_type == "uniform":
                noise = np.array(
                    np.random.uniform(0, self.noise_magnitude, size=shape), dtype=dtype
                )
            else:
                noise = np.array(np.random.normal(0, self.noise_magnitude, size=shape), dtype=dtype)
        else:
            attr = np.random.randint(shape[1])
            if self.noise_type == "uniform":
                noise[:, attr] = np.array(
                    np.random.uniform(0, self.noise_magnitude, size=shape[0]),
                    dtype=dtype,
                )
            else:
                noise[:, attr] = np.array(
                    np.random.normal(0, self.noise_magnitude, size=shape[0]),
                    dtype=dtype,
                )
        return noise



[docs]
    def get_merlin_ratio(self, target_model: TargetModel, X_attack, y_attack):
        """
        Compute the merlin-ratio of every attack data point.

        For each of ``max_t`` rounds, fresh noise is added to ``X_attack``, the
        target model is queried on the noisy points, and every point whose noisy
        loss is strictly greater than its unperturbed loss is counted. The ratio
        is that count divided by ``max_t``.

        Parameters
        ----------
        target_model: guardian_ai.privacy_estimation.model.TargetModel
            Model that is being targeted by the attack.
        X_attack: {array-like, sparse matrix} of shape (n_samples, n_features)
            Input features of the attack datapoints, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        y_attack: ndarray of shape (n_samples,)
            Vector containing the output labels of the attack data points (not membership label).

        Returns
        -------
        ndarray of shape (n_samples,)
            Merlin Ratio of each data point. Values between 0 and 1.

        """
        class_labels = target_model.model.classes_

        # Loss of the unperturbed points: the reference every noisy loss is compared to.
        clean_probs = target_model.get_prediction_probs(X_attack)
        baseline_loss = log_loss_vector(y_attack, clean_probs, labels=class_labels)

        is_sparse = sp.issparse(X_attack)
        higher_loss_count = np.zeros(X_attack.shape[0])
        for _round in range(self.max_t):
            perturbation = self.generate_noise(X_attack.shape, X_attack.dtype)
            if is_sparse:
                # Convert the noise to a sparse matrix before adding it to sparse input.
                perturbation = sp.csr_matrix(perturbation)
            noisy_probs = target_model.get_prediction_probs(X_attack + perturbation)
            noisy_loss = log_loss_vector(y_attack, noisy_probs, labels=class_labels)
            higher_loss_count += np.asarray(noisy_loss > baseline_loss, dtype=int)
        return higher_loss_count / self.max_t



[docs]
    def transform_attack_data(
        self,
        target_model: TargetModel,
        X_attack,
        y_attack,
        split_type: str = None,
        use_cache=False,
    ):
        """
        Overrides transform_attack_data from the base class.

        The attack feature is the merlin ratio, obtained by delegating to
        ``get_merlin_ratio``.

        Parameters
        ----------
        target_model: guardian_ai.privacy_estimation.model.TargetModel
            Target model being attacked.
        X_attack: {array-like, sparse matrix} of shape (n_samples, n_features)
            Input features of the attack datapoints, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.
        y_attack: ndarray of shape (n_samples,)
            Vector containing the output labels of the attack data points (not membership label).
        split_type: str
            Not read by this implementation.
        use_cache: bool
            Not read by this implementation; the ratio is always recomputed.

        Returns
        -------
        X_membership: array-like
            Input feature for the attack model - in this case, the Merlin
            ratio, exactly as returned by ``get_merlin_ratio``.

        """
        return self.get_merlin_ratio(target_model, X_attack, y_attack)