import numpy as np
from annotlib.base import BaseAnnot
from annotlib.utils import check_indices, check_positive_integer, check_shape
from sklearn.utils import check_array, column_or_1d, check_random_state
from numpy_indexed import indices
[docs]class StandardAnnot(BaseAnnot):
"""StandardAnnot
Standard annotators are represented by the class StandardAnnot, which enables to define
arbitrary annotators. In a real-world scenario, an annotator is often a human who is asked to provide class labels
for samples. An instance of the StandardAnnotators class aims at representing such a human within a
Python environment.
Parameters
----------
X: array-like, shape (n_samples, n_features)
Samples of the whole data set.
Y: array-like, shape (n_samples, n_annotators)
Class labels of the given samples X.
C: array-like, shape (n_samples)
confidence score for labelling the given samples x.
confidence_noise : array-like, shape (n_annotators)
An entry of confidence_noise defines the interval from which the noise is uniformly drawn, e.g.
confidence_noise[a] = 0.2 results in sampling n_samples times from U(-0.2, 0.2) and adding this noise
to the confidence scores. Zero noise is the default value for each annotator.
random_state: None | int | numpy.random.RandomState
The random state used for generating class labels of the annotators.
Attributes
----------
X_: numpy.ndarray, shape (n_samples, n_features)
Samples of the whole data set.
Y_: numpy.ndarray, shape (n_samples, n_annotators)
Class labels of the given samples X.
C_: numpy.ndarray, shape (n_samples, n_annotators)
confidence score for labelling the given samples x.
C_noise_: numpy.ndarray, shape (n_samples, n_annotators)
The uniformly noise for each annotator and each sample, e.g. C[x_idx, a_idx] indicates the noise for the
confidence score of annotator with id a_idx in labelling sample with id x_idx.
n_annotators_: int
Number of annotators.
n_queries_: numpy.ndarray, shape (n_annotators)
An entry n_queries_[a_idx] indicates how many queries annotator with id a_idx has processed.
queried_flags_: numpy.ndarray, shape (n_samples, n_annotators)
An entry queried_flags_[a_idx, x_idx] is a boolean indicating whether annotator with id a_idx has provided a
class label for sample with id x_idx.
random_state_: None | int | numpy.random.RandomState
The random state used for generating class labels of the annotators.
Examples
--------
>>> import numpy as np
>>> from sklearn.datasets import load_iris
>>> X, y_true = load_iris(return_X_y=True)
>>> # generate confidence scores
>>> C = np.ones((len(X), 1))
>>> # annotator is always correct
>>> Y = y_true.reshape(-1,1)
>>> annotator = StandardAnnot(X=X, Y=Y, C=C)
>>> # number of annotators
>>> print(annotator.n_annotators())
1
>>> # labelling performance of annotator
>>> print(annotator.labelling_performance(X=X, y_true=y_true))
[1.0]
"""
def __init__(self, X, Y, C=None, confidence_noise=None, random_state=None, probabilistic=False):
# check samples, class labels and confidence scores
self.X_, self.Y_, = check_array(X), check_array(Y, force_all_finite=False)
C = np.full(self.Y_.shape, np.nan) if C is None else C
self.C_ = check_shape(check_array(C, force_all_finite=False), self.Y_.shape, 'C')
if np.size(X, 0) != np.size(self.Y_, 0):
raise ValueError('The number of samples and class labels must be equal.')
# check remaining attributes
self._check_parameters(np.size(Y, 1), np.size(X, 0), confidence_noise, random_state)
self._add_confidence_noise(probabilistic)
[docs] def n_annotators(self):
"""Method returning the number of annotators.
Returns
-------
n_annotators: int
Number of BaseAnnot.
"""
return self.n_annotators_
[docs] def n_queries(self):
"""Method returning the number of queries posed to an annotator.
Returns
-------
n_queries_: numpy.ndarray, shape (n_annotators)
An entry n_queries_[a] indicates how many queries annotator a has processed.
"""
return self.n_queries_
[docs] def queried_samples(self):
"""Method returning the samples for which the annotators were queried to provide class labels.
Returns
-------
X_queried: list, shape (n_annotators, n_samples, n_features)
An entry X_queried_[a] represents the samples for which the annotator a was queried to provide class labels.
"""
return [self.X_[self.queried_flags_[:, a]] for a in range(self.n_annotators())]
[docs] def class_labels(self, X, annotator_ids=None, query_value=1, **kwargs):
"""Method returning the class labels of the given samples.
If the query value is greater than zero, it updates the n_queries and queried sample statistics
Parameters
----------
X: array-like, shape (n_samples, n_features)
Samples whose class labels are queried.
annotator_ids: array-like, shape (n_queried_annotators)
The indices of the annotators whose class labels are queried.
query_value: int
The query value represents the increment of the query statistics of the queried annotators.
Returns
-------
Y: numpy.ndarray, shape (n_samples, n_annotators)
Class labels of the given samples which were provided by the queried annotators.
The non queried annotators return np.nan values.
"""
# check annotator_ids
annotator_ids = check_indices(annotator_ids, self.n_annotators() - 1, 'annotator_ids')
# obtain ids of queried samples
X = check_array(X)
sample_ids = indices(self.X_, X, missing=-1)
sample_ids_flag = sample_ids >= 0
# class labels provided by queried annotators
Y = np.full((np.size(X, 0), self.n_annotators()), np.nan)
Y[sample_ids_flag, annotator_ids[:, None]] = self.Y_[sample_ids[sample_ids_flag], annotator_ids[:, None]]
# update query statistics
if query_value > 0:
self.queried_flags_[sample_ids, annotator_ids[:, None]] = True
self.n_queries_[annotator_ids] += query_value
return Y
[docs] def confidence_scores(self, X, annotator_ids=None, **kwargs):
"""Method returning the confidence scores for labelling the given samples.
Parameters
----------
X: array-like, shape (n_samples, n_features)
Samples whose class labels are queried.
annotator_ids: array-like, shape (n_queried_annotators)
The indices of the annotators whose confidence scores are queried.
Returns
-------
C: numpy.ndarray, shape (n_samples, n_annotators)
confidence scores of the queried annotators for labelling the given samples.
The non queried annotators should return np.nan values.
"""
# check annotator_ids
annotator_ids = check_indices(annotator_ids, self.n_annotators() - 1, 'annotator_ids')
# obtain ids of queried samples
X = check_array(X)
sample_ids = indices(self.X_, X, missing=-1)
sample_ids_flag = sample_ids >= 0
# confidence scores provided by queried annotators
C = np.full((np.size(X, 0), self.n_annotators()), np.nan)
C[sample_ids_flag, annotator_ids[:, None]] = self.C_[sample_ids[sample_ids_flag], annotator_ids[:, None]]
return C
def _check_parameters(self, n_annotators, n_samples, confidence_noise, random_state):
"""
This method is responsible for checking several parameters and to set them as attributes.
Parameters
----------
n_annotators: int
Number of annotators.
n_samples: int
Number of samples.
confidence_noise: array-like, shape (n_samples)
Noise of the confidence scores of each annotator.
random_state: None | int | instance of :py:class:`numpy.random.RandomState`
The random state used for generating class labels of the annotators.
"""
self.n_annotators_ = check_positive_integer(n_annotators, parameter_name='n_annotators')
self.n_queries_ = column_or_1d(np.asarray([0] * self.n_annotators()))
self.queried_flags_ = np.zeros((n_samples, n_annotators), dtype=bool)
# check confidence noise
self.confidence_noise_ = np.zeros(self.n_annotators()) if confidence_noise is None else confidence_noise
self.confidence_noise_ = column_or_1d(self.confidence_noise_)
if len(self.confidence_noise_) != self.n_annotators():
raise ValueError('The number of elements in `confidence_noise` must be a equal to the number of annotators.')
# check random state
self.random_state_ = check_random_state(random_state)
# add confidence noise
self.C_noise_ = np.asarray(
[self.random_state_.uniform(-self.confidence_noise_[a], self.confidence_noise_[a], n_samples) for a in
range(self.n_annotators())]).T
def _add_confidence_noise(self, probabilistic=False):
"""
Add the uniform confidence noise to the confidence scores.
Parameters
----------
probabilistic: boolean
If true, the confidence scores are in the interval [0, 1].
"""
# adjust confidence values
self.C_ += self.C_noise_
if probabilistic:
self.C_[self.C_ > 1] = 1
self.C_[self.C_ < 0] = 0