from typing import Any, Dict, List, Optional, Tuple, Union
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
import pandas as pd
import logging
from epic_kitchens.meta import action_id_from_verb_noun, action_tuples_to_ids
from epic_kitchens.scoring import compute_action_scores, scores_dict_to_ranks
from . import meta

LOG = logging.getLogger(__name__)

Metric = str
Task = str
MetricsDict = Dict[Metric, Any]


def compute_metrics(
groundtruth_df: pd.DataFrame,
scores: Dict[str, Union[np.ndarray, Dict[int, float]]],
many_shot_verbs: Optional[np.ndarray] = None,
many_shot_nouns: Optional[np.ndarray] = None,
many_shot_actions: Optional[np.ndarray] = None,
action_priors: Optional[np.ndarray] = None,
) -> MetricsDict:
"""Compute the EPIC action recognition evaluation metrics from ``scores`` given
ground truth labels in ``groundtruth_df``.

    Args:
groundtruth_df:
DataFrame containing ``verb_class``: :py:class:`int`,
``noun_class``: :py:class:`int`. This function will add an
``action_class`` column containing the action ID obtained from
:py:func:`epic_kitchens.meta.action_id_from_verb_noun`.
scores:
Dictionary containing: ``'verb'``, ``'noun'`` and (optionally) ``'action'`` entries.
``'verb'`` and ``'noun'`` should map to a 2D :py:class:`np.ndarray` of shape
``(n_instances, n_classes)`` where each element is the predicted score of
            that class. ``'action'`` should map to a list of dictionaries, one per
            instance, each mapping an action ID to its score. The order of the
            score arrays should be the same as the order in ``groundtruth_df``.
many_shot_verbs:
The set of verb classes that are considered many shot. If not provided
they are loaded from :py:func:`epic_kitchens.meta.many_shot_verbs`
many_shot_nouns:
The set of noun classes that are considered many shot. If not provided
they are loaded from :py:func:`epic_kitchens.meta.many_shot_nouns`
many_shot_actions:
The set of action classes that are considered many shot. If not provided
they are loaded from :py:func:`epic_kitchens.meta.many_shot_actions`
action_priors:
A ``(n_verbs, n_nouns)`` shaped array containing the action prior used to
weight action predictions.

    Returns:
        A dictionary containing all metrics with the following structure::

            accuracy:
                verb: list[float, length 2]
                noun: list[float, length 2]
                action: list[float, length 2]
            precision:
                verb: float
                noun: float
                action: float
                verb_per_class: dict[str:float, length = n_verbs]
            recall:
                verb: float
                noun: float
                action: float
                verb_per_class: dict[str:float, length = n_verbs]

        Accuracy lists contain the top-k accuracies in the form ``[top_1, top_5]``;
        the precision and recall metrics are macro-averaged and computed over the
        many-shot classes.

    Raises:
ValueError
If the shapes of the ``scores`` arrays are not correct, or the lengths of
``groundtruth_df`` and the ``scores`` arrays are not equal, or if
            ``groundtruth_df`` doesn't have the specified columns.
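
    Example:
        An illustrative sketch of the expected input layout. The class counts,
        scores, and labels here are made up and far smaller than the real EPIC
        class counts::

            groundtruth_df = pd.DataFrame(
                {"verb_class": [0, 1], "noun_class": [1, 0]}
            )
            scores = {
                # (n_instances, n_classes) score arrays, one row per instance
                "verb": np.array([[0.2, 0.8], [0.7, 0.3]]),
                "noun": np.array([[0.1, 0.9], [0.6, 0.4]]),
            }
            # Without an 'action' entry, action scores are derived from the
            # verb and noun scores; the many-shot sets default to the EPIC
            # metadata loaded from :py:mod:`epic_kitchens.meta`.
            metrics = compute_metrics(groundtruth_df, scores)
            metrics["accuracy"]["verb"]  # [top_1_accuracy, top_5_accuracy]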
"""
if many_shot_verbs is None:
many_shot_verbs = np.array(list(meta.many_shot_verbs()))
if many_shot_nouns is None:
many_shot_nouns = np.array(list(meta.many_shot_nouns()))
if many_shot_actions is None:
many_shot_action_ids = np.array(action_tuples_to_ids(meta.many_shot_actions()))
else:
many_shot_action_ids = np.array(action_tuples_to_ids(many_shot_actions))
for entry in "verb", "noun":
class_col = entry + "_class"
if class_col not in groundtruth_df.columns:
raise ValueError("Expected '{}' column in groundtruth_df".format(class_col))
groundtruth_df["action_class"] = action_id_from_verb_noun(
groundtruth_df["verb_class"], groundtruth_df["noun_class"]
)
if "action" not in scores:
(clip_verbs, clip_nouns), clip_scores = compute_action_scores(
scores["verb"], scores["noun"], top_k=100, action_priors=action_priors
)
scores["action"] = [
{
action_id_from_verb_noun(verb, noun): score
for verb, noun, score in zip(verbs, nouns, scores)
}
for verbs, nouns, scores in zip(clip_verbs, clip_nouns, clip_scores)
]
ranks = scores_dict_to_ranks(scores)
top_k = (1, 5)
accuracies = compute_class_aware_metrics(groundtruth_df, ranks, top_k)
precision_recall_metrics = compute_class_agnostic_metrics(
groundtruth_df, ranks, many_shot_verbs, many_shot_nouns, many_shot_action_ids
)
return {
"accuracy": {
"verb": accuracies["verb"],
"noun": accuracies["noun"],
"action": accuracies["action"],
},
**precision_recall_metrics,
}


def compute_class_aware_metrics(
groundtruth_df: pd.DataFrame,
ranks: Dict[str, np.ndarray],
top_k: Union[int, Tuple[int, ...]] = (1, 5),
) -> Dict[str, Union[float, List[float]]]:
"""Compute class aware metrics (accuracy @ 1/5) from ranks.
Args:
groundtruth_df:
DataFrame containing ``'verb_class'``: :py:class:`int`, ``'noun_class'``:
:py:class:`int` and ``'action_class'``: :py:class:`int` columns.
ranks:
Dictionary containing three entries: ``'verb'``, ``'noun'`` and
``'action'``. Entries should map to a 2D :py:class:`np.ndarray`
of shape ``(n_instances, n_classes)`` where the index is the predicted
rank of the class at that index.
top_k:
The set of k values to compute top-k accuracy for.

    Returns:
        Dictionary with the structure::

verb: list[float, length = len(top_k)]
noun: list[float, length = len(top_k)]
action: list[float, length = len(top_k)]
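
    Example:
        A minimal sketch with two instances and made-up class IDs; each row of
        a ranks array lists class IDs from most to least confident::

            groundtruth_df = pd.DataFrame(
                {"verb_class": [0, 1], "noun_class": [1, 0], "action_class": [10, 11]}
            )
            ranks = {
                "verb": np.array([[0, 1], [0, 1]]),
                "noun": np.array([[1, 0], [1, 0]]),
                "action": np.array([[10, 11], [11, 10]]),
            }
            compute_class_aware_metrics(groundtruth_df, ranks, top_k=(1, 2))
            # {'verb': [0.5, 1.0], 'noun': [0.5, 1.0], 'action': [1.0, 1.0]}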
"""
verb_accuracies = topk_accuracy(
ranks["verb"], groundtruth_df["verb_class"].values, ks=top_k
)
noun_accuracies = topk_accuracy(
ranks["noun"], groundtruth_df["noun_class"].values, ks=top_k
)
    action_accuracies = topk_accuracy(
        ranks["action"], groundtruth_df["action_class"].values, ks=top_k
    )
return {
"verb": verb_accuracies,
"noun": noun_accuracies,
"action": action_accuracies,
}


def compute_class_agnostic_metrics(
groundtruth_df: pd.DataFrame,
ranks: Dict[str, np.ndarray],
many_shot_verbs: Optional[np.ndarray] = None,
many_shot_nouns: Optional[np.ndarray] = None,
many_shot_actions: Optional[np.ndarray] = None,
) -> Dict[Metric, Dict[Task, Union[float, Dict[str, float]]]]:
"""
    Compute class-agnostic metrics (many-shot precision and recall) from ranks.

    Args:
groundtruth_df:
DataFrame containing ``'verb_class'``: :py:class:`int`,
``'noun_class'``: :py:class:`int` and ``'action_class'``: :py:class:`int`
columns.
ranks:
Dictionary containing three entries: ``'verb'``, ``'noun'`` and
``'action'``. Entries should map to a 2D :py:class:`np.ndarray`
of shape ``(n_instances, n_classes)`` where the index is the predicted
rank of the class at that index.
many_shot_verbs:
The set of verb classes that are considered many shot. If not provided
they are loaded from :py:func:`epic_kitchens.meta.many_shot_verbs`
many_shot_nouns:
The set of noun classes that are considered many shot. If not provided
they are loaded from :py:func:`epic_kitchens.meta.many_shot_nouns`
many_shot_actions:
The set of action classes that are considered many shot. If not provided
they are loaded from :py:func:`epic_kitchens.meta.many_shot_actions`

    Returns:
        Dictionary with the structure::

precision:
verb: float
noun: float
action: float
verb_per_class: dict[str:float, length = n_verbs]
recall:
verb: float
noun: float
action: float
verb_per_class: dict[str:float, length = n_verbs]

    The ``'verb'``, ``'noun'``, and ``'action'`` entries of the metric dictionaries
    are the macro-averaged mean precision/recall over the set of many-shot classes,
    whereas the ``'verb_per_class'`` entry is a per-class breakdown in the form of
    a dictionary mapping the stringified verb class to that class's
    precision/recall.
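
    Example:
        A minimal sketch with made-up class IDs, passing the many-shot sets
        explicitly so that no metadata needs to be loaded::

            groundtruth_df = pd.DataFrame(
                {"verb_class": [0, 1], "noun_class": [1, 0], "action_class": [10, 11]}
            )
            ranks = {
                "verb": np.array([[0, 1], [1, 0]]),
                "noun": np.array([[1, 0], [0, 1]]),
                "action": np.array([[10, 11], [11, 10]]),
            }
            metrics = compute_class_agnostic_metrics(
                groundtruth_df,
                ranks,
                many_shot_verbs=np.array([0, 1]),
                many_shot_nouns=np.array([0, 1]),
                many_shot_actions=np.array([10, 11]),
            )
            metrics["precision"]["verb"]  # 1.0: every top-1 verb is correct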
"""
if many_shot_verbs is None:
many_shot_verbs = np.array(list(meta.many_shot_verbs()))
if many_shot_nouns is None:
many_shot_nouns = np.array(list(meta.many_shot_nouns()))
if many_shot_actions is None:
many_shot_actions = np.array(action_tuples_to_ids(meta.many_shot_actions()))
many_shot_verbs = _exclude_non_existent_classes(
many_shot_verbs, groundtruth_df["verb_class"]
)
many_shot_nouns = _exclude_non_existent_classes(
many_shot_nouns, groundtruth_df["noun_class"]
)
    LOG.debug(
        "%d many shot actions before intersecting with actions present in test",
        len(many_shot_actions),
    )
    many_shot_actions = _exclude_non_existent_classes(
        many_shot_actions, groundtruth_df["action_class"]
    )
verb_precision, verb_recall = precision_recall(
ranks["verb"], groundtruth_df.verb_class, classes=many_shot_verbs
)
noun_precision, noun_recall = precision_recall(
ranks["noun"], groundtruth_df.noun_class, classes=many_shot_nouns
)
    LOG.info(
        "%d many shot actions after intersecting with actions present in test",
        len(many_shot_actions),
    )
action_precision, action_recall = precision_recall(
ranks["action"], groundtruth_df["action_class"], classes=many_shot_actions
)
precision_many_shot_verbs = {
str(verb): score for verb, score in zip(many_shot_verbs, verb_precision)
}
recall_many_shot_verbs = {
str(verb): score for verb, score in zip(many_shot_verbs, verb_recall)
}
return {
"precision": {
"action": action_precision.mean(),
"verb": verb_precision.mean(),
"noun": noun_precision.mean(),
"verb_per_class": precision_many_shot_verbs,
},
"recall": {
"action": action_recall.mean(),
"verb": verb_recall.mean(),
"noun": noun_recall.mean(),
"verb_per_class": recall_many_shot_verbs,
},
}


def topk_accuracy(
rankings: np.ndarray, labels: np.ndarray, ks: Union[Tuple[int, ...], int] = (1, 5)
) -> Union[float, List[float]]:
"""Computes top-k accuracies for different values of k from rankings.
Args:
rankings: 2D rankings array ``(n_instances, n_classes)``
labels: 1D correct labels array ``(n_instances,)``
ks: The k values in top-k

    Returns:
Top-k accuracy for each ``k`` in ``ks``. If only one ``k`` is provided,
then only a single float is returned.

    Raises:
ValueError
If the dimensionality of the ``rankings`` or ``labels`` is incorrect, or
if the length of ``rankings`` and ``labels`` aren't equal.
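
    Example:
        A small sketch; each row of ``rankings`` orders class IDs from most to
        least confident::

            rankings = np.array([[2, 0, 1],
                                 [1, 0, 2]])
            labels = np.array([2, 0])
            topk_accuracy(rankings, labels, ks=(1, 2))  # [0.5, 1.0]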
"""
if isinstance(ks, int):
ks = (ks,)
_check_label_predictions_preconditions(rankings, labels)
# trim to max k to avoid extra computation
maxk = np.max(ks)
# compute true positives in the top-maxk predictions
tp = rankings[:, :maxk] == labels.reshape(-1, 1)
# trim to selected ks and compute accuracies
accuracies = [tp[:, :k].max(1).mean() for k in ks]
if len(accuracies) == 1:
return accuracies[0]
else:
return accuracies


def precision_recall(
rankings: np.ndarray, labels: np.ndarray, classes: Optional[np.ndarray] = None
) -> Tuple[np.ndarray, np.ndarray]:
"""Computes precision and recall from rankings.
Args:
rankings: 2D array of shape ``(n_instances, n_classes)``
labels: 1D array of shape = ``(n_instances,)``
classes: Iterable of classes to compute the metrics over.

    Returns:
Tuple of ``(precision, recall)`` where
``precision`` is a 1D array of shape ``(len(classes),)``, and
``recall`` is a 1D array of shape ``(len(classes),)``

    Raises:
ValueError
If the dimensionality of the ``rankings`` or ``labels`` is incorrect, or if
the length of the ``rankings`` and ``labels`` are not equal, or if the set
of the provided ``classes`` is not a subset of the classes present in
``labels``.
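
    Example:
        A small sketch; only the top-1 prediction (column 0 of ``rankings``)
        counts towards precision and recall::

            rankings = np.array([[0, 1], [1, 0], [0, 1]])
            labels = np.array([0, 1, 1])
            precision, recall = precision_recall(rankings, labels)
            # precision: [0.5, 1.0], recall: [1.0, 0.5] for classes [0, 1]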
"""
_check_label_predictions_preconditions(rankings, labels)
y_pred = rankings[:, 0]
if classes is None:
classes = np.unique(labels)
else:
        provided_class_presence = np.isin(classes, np.unique(labels))
        if not np.all(provided_class_presence):
            raise ValueError(
                "Classes {} are not in labels".format(
                    classes[~provided_class_presence]
                )
            )
    precision, recall, _, _ = precision_recall_fscore_support(
        labels, y_pred, labels=classes, average=None, warn_for=("recall",)
    )
return precision, recall


def _exclude_non_existent_classes(classes: np.ndarray, labels: pd.Series) -> np.ndarray:
    """Keep only the entries of ``classes`` that actually appear in ``labels``."""
return np.intersect1d(classes, labels.unique())


def _check_label_predictions_preconditions(
    rankings: np.ndarray, labels: np.ndarray
) -> None:
    """Validate that ``rankings`` is 2D, ``labels`` is 1D, and lengths match."""
    if rankings.ndim != 2:
        raise ValueError(
            "Rankings should be a 2D matrix, but was {}D".format(rankings.ndim)
        )
    if labels.ndim != 1:
        raise ValueError("Labels should be a 1D vector but was {}D".format(labels.ndim))
    if labels.shape[0] != rankings.shape[0]:
raise ValueError(
"Number of labels provided does not match number of predictions"
)