Source code for annotlib.standard

import numpy as np

from annotlib.base import BaseAnnot
from annotlib.utils import check_indices, check_positive_integer, check_shape

from sklearn.utils import check_array, column_or_1d, check_random_state

from numpy_indexed import indices


class StandardAnnot(BaseAnnot):
    """StandardAnnot

    Standard annotators are represented by the class StandardAnnot, which enables to define
    arbitrary annotators. In a real-world scenario, an annotator is often a human who is asked to provide class labels
    for samples. An instance of the StandardAnnot class aims at representing such a human within a
    Python environment.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Samples of the whole data set.
    Y: array-like, shape (n_samples, n_annotators)
        Class labels of the given samples X.
    C: array-like, shape (n_samples, n_annotators)
        Confidence scores for labelling the given samples X. It must have the same shape as Y.
        If None, all confidence scores are np.nan. The given array is copied and never modified.
    confidence_noise : array-like, shape (n_annotators)
        An entry of confidence_noise defines the interval from which the noise is uniformly drawn, e.g.
        confidence_noise[a] = 0.2 results in sampling n_samples times from U(-0.2, 0.2) and adding this noise
        to the confidence scores. Zero noise is the default value for each annotator.
    random_state: None | int | numpy.random.RandomState
        The random state used for drawing the confidence noise.
    probabilistic: boolean
        If true, the noisy confidence scores are clipped to the interval [0, 1].

    Attributes
    ----------
    X_: numpy.ndarray, shape (n_samples, n_features)
        Samples of the whole data set.
    Y_: numpy.ndarray, shape (n_samples, n_annotators)
        Class labels of the given samples X.
    C_: numpy.ndarray, shape (n_samples, n_annotators)
        Confidence scores (including the added noise) for labelling the given samples X.
    confidence_noise_: numpy.ndarray, shape (n_annotators)
        Half-width of the uniform noise interval of each annotator.
    C_noise_: numpy.ndarray, shape (n_samples, n_annotators)
        The uniform noise for each annotator and each sample, e.g. C_noise_[x_idx, a_idx] indicates the noise for
        the confidence score of annotator with id a_idx in labelling sample with id x_idx.
    n_annotators_: int
        Number of annotators.
    n_queries_: numpy.ndarray, shape (n_annotators)
        An entry n_queries_[a_idx] indicates how many queries annotator with id a_idx has processed.
    queried_flags_: numpy.ndarray, shape (n_samples, n_annotators)
        An entry queried_flags_[x_idx, a_idx] is a boolean indicating whether annotator with id a_idx has provided a
        class label for sample with id x_idx.
    random_state_: numpy.random.RandomState
        The random state obtained from `random_state` via `check_random_state`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.datasets import load_iris
    >>> X, y_true = load_iris(return_X_y=True)
    >>> # generate confidence scores
    >>> C = np.ones((len(X), 1))
    >>> # annotator is always correct
    >>> Y = y_true.reshape(-1,1)
    >>> annotator = StandardAnnot(X=X, Y=Y, C=C)
    >>> # number of annotators
    >>> print(annotator.n_annotators())
    1
    >>> # labelling performance of annotator
    >>> print(annotator.labelling_performance(X=X, y_true=y_true))
    [1.0]
    """

    def __init__(self, X, Y, C=None, confidence_noise=None, random_state=None, probabilistic=False):

        # check samples and class labels first, so that a mismatch between X and Y is reported as such
        self.X_ = check_array(X)
        self.Y_ = check_array(Y, force_all_finite=False)
        n_samples, n_annotators = self.Y_.shape
        if self.X_.shape[0] != n_samples:
            raise ValueError('The number of samples and class labels must be equal.')

        # check confidence scores; a private float copy is required because the noise is added in-place
        # (otherwise the caller's array would be modified and integer arrays could not take the float noise)
        C = np.full(self.Y_.shape, np.nan) if C is None else C
        C = check_array(C, force_all_finite=False, dtype=float, copy=True)
        self.C_ = check_shape(C, self.Y_.shape, 'C')

        # check remaining attributes
        self._check_parameters(n_annotators, n_samples, confidence_noise, random_state)
        self._add_confidence_noise(probabilistic)

    def n_annotators(self):
        """Method returning the number of annotators.

        Returns
        -------
        n_annotators: int
            Number of annotators.
        """
        return self.n_annotators_

    def n_queries(self):
        """Method returning the number of queries posed to an annotator.

        Returns
        -------
        n_queries_: numpy.ndarray, shape (n_annotators)
            An entry n_queries_[a] indicates how many queries annotator a has processed.
        """
        return self.n_queries_

    def queried_samples(self):
        """Method returning the samples for which the annotators were queried to provide class labels.

        Returns
        -------
        X_queried: list, shape (n_annotators, n_samples, n_features)
            An entry X_queried_[a] represents the samples for which the annotator a was queried to provide class labels.
        """
        return [self.X_[self.queried_flags_[:, a]] for a in range(self.n_annotators())]

    def class_labels(self, X, annotator_ids=None, query_value=1, **kwargs):
        """Method returning the class labels of the given samples.
        If the query value is greater than zero, it updates the n_queries and queried sample statistics

        Parameters
        ----------
        X: array-like, shape (n_samples, n_features)
             Samples whose class labels are queried.
        annotator_ids: array-like, shape (n_queried_annotators)
            The indices of the annotators whose class labels are queried.
        query_value: int
            The query value represents the increment of the query statistics of the queried annotators.

        Returns
        -------
        Y: numpy.ndarray, shape (n_samples, n_annotators)
            Class labels of the given samples which were provided by the queried annotators.
            The non queried annotators return np.nan values. Samples which are not part of the stored
            data set X_ also obtain np.nan values.
        """
        # check annotator_ids
        annotator_ids = check_indices(annotator_ids, self.n_annotators() - 1, 'annotator_ids')

        # class labels provided by queried annotators
        Y, known_sample_ids = self._gather(self.Y_, X, annotator_ids)

        # update query statistics
        if query_value > 0:
            # only samples found in X_ are flagged; the placeholder index -1 of unknown samples
            # would otherwise wrongly mark the last stored sample as queried
            self.queried_flags_[known_sample_ids, annotator_ids[:, None]] = True
            self.n_queries_[annotator_ids] += query_value

        return Y

    def confidence_scores(self, X, annotator_ids=None, **kwargs):
        """Method returning the confidence scores for labelling the given samples.

        Parameters
        ----------
        X: array-like, shape (n_samples, n_features)
             Samples whose confidence scores are queried.
        annotator_ids: array-like, shape (n_queried_annotators)
            The indices of the annotators whose confidence scores are queried.

        Returns
        -------
        C: numpy.ndarray, shape (n_samples, n_annotators)
            Confidence scores of the queried annotators for labelling the given samples.
            The non queried annotators return np.nan values. Samples which are not part of the stored
            data set X_ also obtain np.nan values.
        """
        # check annotator_ids
        annotator_ids = check_indices(annotator_ids, self.n_annotators() - 1, 'annotator_ids')

        # confidence scores provided by queried annotators
        C, _ = self._gather(self.C_, X, annotator_ids)

        return C

    def _gather(self, values, X, annotator_ids):
        """
        Look up the rows of X within the stored samples X_ and collect the corresponding entries of `values`.

        Parameters
        ----------
        values: numpy.ndarray, shape (n_stored_samples, n_annotators)
            Per-sample and per-annotator values, i.e. Y_ or C_.
        X: array-like, shape (n_samples, n_features)
            Queried samples.
        annotator_ids: numpy.ndarray, shape (n_queried_annotators)
            Already checked indices of the queried annotators.

        Returns
        -------
        out: numpy.ndarray, shape (n_samples, n_annotators)
            Entries of `values` for the known samples and queried annotators, np.nan everywhere else.
        known_sample_ids: numpy.ndarray
            Row indices within X_ of those queried samples which were found.
        """
        # obtain ids of queried samples; samples missing in X_ are marked by -1
        X = check_array(X)
        sample_ids = indices(self.X_, X, missing=-1)
        is_known = sample_ids >= 0
        known_sample_ids = sample_ids[is_known]

        # the column vector of annotator ids broadcasts against the sample ids
        annotator_col = annotator_ids[:, None]
        out = np.full((np.size(X, 0), self.n_annotators()), np.nan)
        out[is_known, annotator_col] = values[known_sample_ids, annotator_col]

        return out, known_sample_ids

    def _check_parameters(self, n_annotators, n_samples, confidence_noise, random_state):
        """
        This method is responsible for checking several parameters and to set them as attributes.

        Parameters
        ----------
        n_annotators: int
            Number of annotators.
        n_samples: int
            Number of samples.
        confidence_noise: array-like, shape (n_annotators)
            Noise of the confidence scores of each annotator.
        random_state: None | int | instance of :py:class:`numpy.random.RandomState`
            The random state used for drawing the confidence noise.

        Raises
        ------
        ValueError
            If the number of elements in `confidence_noise` differs from the number of annotators.
        """
        self.n_annotators_ = check_positive_integer(n_annotators, parameter_name='n_annotators')
        self.n_queries_ = np.zeros(self.n_annotators(), dtype=int)
        self.queried_flags_ = np.zeros((n_samples, n_annotators), dtype=bool)

        # check confidence noise
        self.confidence_noise_ = np.zeros(self.n_annotators()) if confidence_noise is None else confidence_noise
        self.confidence_noise_ = column_or_1d(self.confidence_noise_)

        if len(self.confidence_noise_) != self.n_annotators():
            raise ValueError('The number of elements in `confidence_noise` must be a equal to the number of annotators.')

        # check random state
        self.random_state_ = check_random_state(random_state)

        # draw confidence noise annotator by annotator (this order keeps the results for a given seed stable)
        # and transpose the result to the shape (n_samples, n_annotators)
        self.C_noise_ = np.asarray(
            [self.random_state_.uniform(-width, width, n_samples) for width in self.confidence_noise_]).T

    def _add_confidence_noise(self, probabilistic=False):
        """
        Add the uniform confidence noise to the confidence scores.

        Parameters
        ----------
        probabilistic: boolean
            If true, the confidence scores are in the interval [0, 1].
        """
        # adjust confidence values in-place
        self.C_ += self.C_noise_
        if probabilistic:
            # np.nan entries remain np.nan
            np.clip(self.C_, 0, 1, out=self.C_)