# Source code for guardian_ai.privacy_estimation.combined_attacks

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import numpy as np
from sklearn.base import BaseEstimator

from guardian_ai.privacy_estimation.attack import (
    AttackType,
    BlackBoxAttack,
    ConfidenceBasedBlackBoxAttack,
    LossBasedBlackBoxAttack,
)
from guardian_ai.privacy_estimation.merlin_attack import MerlinAttack
from guardian_ai.privacy_estimation.model import TargetModel
from guardian_ai.privacy_estimation.utils import log_loss_vector



class CombinedBlackBoxAttack(BlackBoxAttack):
    """
    Black-box attack that stacks per-instance loss and confidence features.

    Similar in spirit to the Morgan attack, which combines loss and the merlin ratio.
    In this attack, we combine loss and confidence values and, instead of tuning the
    thresholds, we combine them using a trained classifier, like stacking.
    """

    def __init__(
        self,
        attack_model: BaseEstimator,
        loss_attack: LossBasedBlackBoxAttack = None,
        confidence_attack: ConfidenceBasedBlackBoxAttack = None,
    ):
        """
        Initialize CombinedBlackBoxAttack.

        Parameters
        ----------
        attack_model: sklearn.base.BaseEstimator
            Attack model forwarded to the base class constructor.
        loss_attack: guardian_ai.privacy_estimation.attack.LossBasedBlackBoxAttack
            Optional. Only read by ``transform_attack_data`` when it is called
            with ``use_cache=True``.
        confidence_attack: guardian_ai.privacy_estimation.attack.ConfidenceBasedBlackBoxAttack
            Optional. Only read by ``transform_attack_data`` when it is called
            with ``use_cache=True``.

        """
        self.loss_attack = loss_attack
        self.confidence_attack = confidence_attack
        super().__init__(attack_model, name=AttackType.CombinedBlackBoxAttack.name)

    def transform_attack_data(
        self,
        target_model: TargetModel,
        X_attack,
        y_attack,
        split_type: str = None,
        use_cache=False,
    ):
        """
        Overriding the method transform_attack_data from the base class.
        Calculates the per instance loss and confidence.

        Parameters
        ----------
        target_model: guardian_ai.privacy_estimation.model.TargetModel
            Target model being attacked.
        X_attack: {array-like, sparse matrix} of shape (n_samples, n_features)
            Input features of the attack datapoints, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.
        y_attack: ndarray of shape (n_samples,)
            Vector containing the output labels of the attack data points (not membership label).
        split_type: str
            Which cached split to reuse, ``"train"`` or ``"test"``. Only read when
            ``use_cache`` is True; the features are then taken from the loss based
            and confidence based attacks passed to the constructor.
        use_cache: bool
            If True, reuse the ``X_membership_train`` / ``X_membership_test`` values
            stored on ``loss_attack`` and ``confidence_attack`` instead of querying
            ``target_model``.

        Returns
        -------
        X_membership: ndarray
            Input features for the attack model, built with ``np.column_stack``.
            Columns are, in order, the per-instance loss feature (negated output
            of ``log_loss_vector`` when not cached) and the confidence feature
            (maximum predicted probability when not cached).

        Raises
        ------
        ValueError
            If ``use_cache`` is True and ``split_type`` is neither ``"train"`` nor
            ``"test"``, or if ``loss_attack`` / ``confidence_attack`` was not provided.

        """
        if use_cache:
            if split_type not in ("train", "test"):
                # ValueError is a subclass of Exception, so callers that caught
                # the previously raised bare Exception keep working.
                raise ValueError("split type specified is not cached")
            if self.loss_attack is None or self.confidence_attack is None:
                # Both attacks default to None in __init__; fail with a clear
                # message instead of an AttributeError on NoneType.
                raise ValueError(
                    "use_cache=True requires both loss_attack and confidence_attack to be provided"
                )
            if split_type == "train":
                my_per_instance_loss = self.loss_attack.X_membership_train
                my_confidence = self.confidence_attack.X_membership_train
            else:
                my_per_instance_loss = self.loss_attack.X_membership_test
                my_confidence = self.confidence_attack.X_membership_test
        else:
            labels = target_model.model.classes_
            probs = target_model.get_prediction_probs(X_attack)
            # The sign is flipped so the feature is the negated per-instance loss.
            my_per_instance_loss = -log_loss_vector(y_attack, probs, labels=labels)
            # Confidence is the largest predicted probability in each row.
            my_confidence = np.max(probs, 1)
        X_membership = np.column_stack((my_per_instance_loss, my_confidence))
        return X_membership




class CombinedWithMerlinBlackBoxAttack(BlackBoxAttack):
    """
    Black-box attack that stacks loss, confidence and Merlin ratio features.

    Similar in spirit to the Morgan attack, which combines loss and the merlin ratio.
    In this attack, we combine loss, confidence values and merlin ratio,
    and instead of tuning the thresholds, we combine them using
    a trained classifier, like stacking.
    """

    def __init__(
        self,
        attack_model: BaseEstimator,
        merlin_attack: MerlinAttack,  # this must be passed
        loss_attack: LossBasedBlackBoxAttack = None,
        confidence_attack: ConfidenceBasedBlackBoxAttack = None,
    ):
        """
        Initialize CombinedWithMerlinBlackBoxAttack.

        Parameters
        ----------
        attack_model: sklearn.base.BaseEstimator
            Attack model forwarded to the base class constructor.
        merlin_attack: guardian_ai.privacy_estimation.merlin_attack.MerlinAttack
            Required. Its ``get_merlin_ratio`` method is called when features are
            computed without the cache; its cached ``X_membership_*`` values are
            read otherwise.
        loss_attack: guardian_ai.privacy_estimation.attack.LossBasedBlackBoxAttack
            Optional. Only read by ``transform_attack_data`` when it is called
            with ``use_cache=True``.
        confidence_attack: guardian_ai.privacy_estimation.attack.ConfidenceBasedBlackBoxAttack
            Optional. Only read by ``transform_attack_data`` when it is called
            with ``use_cache=True``.

        """
        self.merlin_attack = merlin_attack
        self.loss_attack = loss_attack
        self.confidence_attack = confidence_attack
        super().__init__(attack_model, name=AttackType.CombinedWithMerlinBlackBoxAttack.name)

    def transform_attack_data(
        self,
        target_model: TargetModel,
        X_attack,
        y_attack,
        split_type: str = None,
        use_cache: bool = False,
    ):
        """
        Overriding the method transform_attack_data from the base class.
        Calculates the Merlin ratio, and combines it with per instance loss and confidence

        Parameters
        ----------
        target_model: guardian_ai.privacy_estimation.model.TargetModel
            Target model being attacked.
        X_attack: {array-like, sparse matrix} of shape (n_samples, n_features)
            Input features of the attack datapoints, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.
        y_attack: ndarray of shape (n_samples,)
            Vector containing the output labels of the attack data points (not membership label).
        split_type: str
            Which cached split to reuse, ``"train"`` or ``"test"``. Only read when
            ``use_cache`` is True; the features are then taken from the loss based,
            confidence based and merlin attacks passed to the constructor.
        use_cache: bool
            If True, reuse the ``X_membership_train`` / ``X_membership_test`` values
            stored on the three attacks instead of querying ``target_model``.

        Returns
        -------
        X_membership: ndarray
            Input features for the attack model, built with ``np.column_stack``.
            Columns are, in order, the per-instance loss feature (negated output
            of ``log_loss_vector`` when not cached), the confidence feature
            (maximum predicted probability when not cached) and the Merlin ratio.

        Raises
        ------
        ValueError
            If ``use_cache`` is True and ``split_type`` is neither ``"train"`` nor
            ``"test"``, or if any of the three attacks was not provided.

        """
        if use_cache:
            if split_type not in ("train", "test"):
                # ValueError is a subclass of Exception, so callers that caught
                # the previously raised bare Exception keep working.
                raise ValueError("split type specified is not cached")
            cached_sources = (self.loss_attack, self.confidence_attack, self.merlin_attack)
            if any(attack is None for attack in cached_sources):
                # loss_attack and confidence_attack default to None in __init__;
                # fail with a clear message instead of an AttributeError on NoneType.
                raise ValueError(
                    "use_cache=True requires loss_attack, confidence_attack and "
                    "merlin_attack to be provided"
                )
            if split_type == "train":
                my_per_instance_loss = self.loss_attack.X_membership_train
                my_confidence = self.confidence_attack.X_membership_train
                merlin_ratio = self.merlin_attack.X_membership_train
            else:
                my_per_instance_loss = self.loss_attack.X_membership_test
                my_confidence = self.confidence_attack.X_membership_test
                merlin_ratio = self.merlin_attack.X_membership_test
        else:
            labels = target_model.model.classes_
            probs = target_model.get_prediction_probs(X_attack)
            # The sign is flipped so the feature is the negated per-instance loss.
            my_per_instance_loss = -log_loss_vector(y_attack, probs, labels=labels)
            # Confidence is the largest predicted probability in each row.
            my_confidence = np.max(probs, 1)
            merlin_ratio = self.merlin_attack.get_merlin_ratio(target_model, X_attack, y_attack)
        X_membership = np.column_stack((my_per_instance_loss, my_confidence, merlin_ratio))
        return X_membership