Source code for rankeval.analysis.statistical

# Copyright (c) 2017, All Contributors (see CONTRIBUTORS file)
# Authors: Claudio Lucchese <claudio.lucchese@isti.cnr.it>, Cristina I. Muntean <cristina.muntean@isti.cnr.it>
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""
This module implements several statistical analyses: statistical significance tests and the bias vs. variance decomposition of the error.
"""

from __future__ import print_function

import math

import numpy as np
import xarray as xr

from ..dataset import Dataset
from ..metrics.metric import Metric
from rankeval.metrics import MSE

from ipywidgets import IntProgress
from IPython.display import display

def statistical_significance(datasets, model_a, model_b, metrics,
                             n_perm=100000):
    """
    This method computes the statistical significance of the performance
    difference between model_a and model_b.

    Parameters
    ----------
    datasets : list of Dataset
        The datasets to use for analyzing the behaviour of the models
        using the given metrics.
    model_a : RTEnsemble
        The first model considered.
    model_b : RTEnsemble
        The second model considered.
    metrics : list of Metric
        The metrics to use for the analysis.
    n_perm : int
        Number of permutations for the randomization test.

    Returns
    -------
    stat_sig : xarray.DataArray
        A DataArray containing the one-sided and two-sided p-values of the
        performance difference between the two models, for each given
        dataset and metric.
    """
    progress_bar = IntProgress(min=0, max=len(datasets) * len(metrics),
                               description="Iterating datasets and metrics")
    display(progress_bar)

    data = np.zeros(shape=(len(datasets), len(metrics), 2), dtype=np.float32)
    for idx_dataset, dataset in enumerate(datasets):
        y_pred_a = model_a.score(dataset, detailed=False)
        y_pred_b = model_b.score(dataset, detailed=False)

        for idx_metric, metric in enumerate(metrics):
            progress_bar.value += 1

            metrics_a = metric.eval(dataset, y_pred_a)[1]
            metrics_b = metric.eval(dataset, y_pred_b)[1]

            p1, p2 = _randomization(metrics_a, metrics_b, n_perm=n_perm)
            data[idx_dataset][idx_metric][0] = p1
            data[idx_dataset][idx_metric][1] = p2

    progress_bar.bar_style = "success"
    progress_bar.close()

    performance = xr.DataArray(
        data,
        name='Statistical Significance',
        coords=[datasets, metrics, ['one-sided', 'two-sided']],
        dims=['dataset', 'metric', 'p-value'])

    return performance

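# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It assumes a
# test Dataset, two trained RTEnsemble models and a Metric instance (e.g.,
# NDCG@10) have already been loaded elsewhere; the parameter names below are
# hypothetical placeholders, not rankeval API objects.
# ---------------------------------------------------------------------------
def _example_statistical_significance(test_dataset, model_a, model_b,
                                      ndcg_10, n_perm=100000):
    """
    Sketch: compare two models on one dataset/metric and return the
    two-sided p-value of the randomization test.
    """
    # One dataset and one metric yield a 1 x 1 x 2 DataArray holding the
    # one-sided and two-sided p-values.
    stat_sig = statistical_significance(datasets=[test_dataset],
                                        model_a=model_a,
                                        model_b=model_b,
                                        metrics=[ndcg_10],
                                        n_perm=n_perm)
    # The third dimension is named 'p-value'; select the two-sided estimate.
    return stat_sig.sel({'p-value': 'two-sided'})
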
def _randomization(metric_scores_a, metric_scores_b, n_perm=100000):
    """
    This method computes the randomization test as described in [1].

    Parameters
    ----------
    metric_scores_a : numpy array
        Vector of per-query metric scores for the IR system A.
    metric_scores_b : numpy array
        Vector of per-query metric scores for the IR system B.
    n_perm : int
        Number of permutations evaluated in the randomization test.

    Returns
    -------
    metric_scores : (float, float)
        A tuple (p-value_1, p-value_2) being respectively the one-sided and
        two-sided p-values.

    References
    ----------
    .. [1] Smucker, Mark D., James Allan, and Ben Carterette.
        "A comparison of statistical significance tests for information
        retrieval evaluation." In Proceedings of the sixteenth ACM conference
        on Conference on information and knowledge management, pp. 623-632.
        ACM, 2007.
    """
    progress_bar = IntProgress(min=0, max=10,
                               description="Randomization Test")
    display(progress_bar)

    # find the best system
    metric_scores_a_mean = np.mean(metric_scores_a)
    metric_scores_b_mean = np.mean(metric_scores_b)

    best_metrics = metric_scores_a
    worst_metrics = metric_scores_b
    if metric_scores_a_mean < metric_scores_b_mean:
        best_metrics = metric_scores_b
        worst_metrics = metric_scores_a

    difference = np.mean(best_metrics) - np.mean(worst_metrics)
    abs_difference = np.abs(difference)

    p1 = 0.0  # one-sided
    p2 = 0.0  # two-sided
    N = float(len(metric_scores_a))

    a_sum = np.sum(best_metrics)
    b_sum = np.sum(worst_metrics)

    # repeat n_perm times
    for i in range(n_perm):
        if i % (n_perm // 10) == 0:
            progress_bar.value += 1

        # select a random subset of queries whose scores are swapped
        sel = np.random.choice([False, True], len(metric_scores_a))

        a_sel_sum = np.sum(best_metrics[sel])
        b_sel_sum = np.sum(worst_metrics[sel])

        # compute avg performance of randomized models
        a_mean = (a_sum - a_sel_sum + b_sel_sum) / N
        b_mean = (b_sum - b_sel_sum + a_sel_sum) / N

        # performance difference
        delta = a_mean - b_mean

        if delta >= difference:
            p1 += 1.
        if np.abs(delta) >= abs_difference:
            p2 += 1.

    progress_bar.bar_style = "success"
    progress_bar.close()

    p1 /= n_perm
    p2 /= n_perm

    return p1, p2

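# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the randomization
# test applied to two synthetic vectors of per-query scores, where system B
# is shifted upward by 0.05 on average. The distributions and sample size
# are arbitrary choices made only for this example.
# ---------------------------------------------------------------------------
def _example_randomization_on_toy_scores(n_queries=200, n_perm=10000):
    """
    Sketch: with a consistent +0.05 advantage for system B over 200 queries,
    both p-values are expected to be close to zero.
    """
    rng = np.random.RandomState(42)
    scores_a = rng.uniform(0.4, 0.8, size=n_queries)
    scores_b = np.clip(scores_a + 0.05 + rng.normal(0., 0.02, size=n_queries),
                       0., 1.)
    p_one_sided, p_two_sided = _randomization(scores_a, scores_b,
                                              n_perm=n_perm)
    return p_one_sided, p_two_sided
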
""" progress_bar = IntProgress(min=0, max=k, description="Processing k folds") display(progress_bar) scores = np.zeros(dataset.n_instances, dtype=np.float32) q_lens = dataset.query_ids[1:]-dataset.query_ids[:-1] # shuffle queries shuffled_qid = np.random.permutation(dataset.n_queries) chunk_size = int(math.ceil(dataset.n_queries/float(k))) for f,p in enumerate(range(0,dataset.n_queries,chunk_size)): progress_bar.value += 1 # p-th fold is used for testing test_rows = np.zeros(dataset.n_instances, dtype=np.bool) for q in shuffled_qid[p: p+chunk_size]: test_rows[ dataset.query_ids[q]:dataset.query_ids[q+1] ] = True # other folds are used for training train_rows = np.logical_not(test_rows) train_q = np.ones(dataset.n_queries, dtype=np.bool) train_q[shuffled_qid[p: p+chunk_size]] = False # get algorithm predictions fold_scores = algo(dataset.X[train_rows], dataset.y[train_rows], q_lens[train_q], dataset.X[test_rows]) # update scores for the current fold scores[test_rows] = fold_scores progress_bar.bar_style = "success" progress_bar.close() return scores def _multi_kfold_scoring(dataset, algo, L=10, k=2): """ Performs multiple scorings of the given dataset. Parameters ---------- dataset : rankeval.dataset.Dataset The dataset instance. algo : function See :func:`bias_variance`. L : int Number of iterations k : int Number of folds. Returns ------- score : numpy.ndarray A matrix num_instances x L. """ progress_bar = IntProgress(min=0, max=L, description="Computing L scores") display(progress_bar) scores = np.zeros( (dataset.n_instances, L), dtype=np.float32) for l in range(L): progress_bar.value += 1 scores[:,l] = _kfold_scoring(dataset, k, algo) progress_bar.bar_style = "success" progress_bar.close() return scores
def bias_variance(datasets=[], algos=[], metrics=[], L=10, k=2):
    """
    This method computes the bias vs. variance decomposition of the error.
    The approach used here is based on the works of [Webb05]_ and [Dom05]_.

    Each instance of the dataset is scored `L` times. A single scoring is
    achieved by splitting the dataset at random into `k` folds. Each fold is
    scored by the model `M` trained on the remaining folds. [Webb05]_
    recommends the use of 2 folds.

    If the metric is MSE, the standard decomposition is used. The bias for an
    instance `x` is defined as the mean squared error of the `L` trained
    models w.r.t. the true label `y`, denoted with
    :math:`{\\sf E}_{L} [M(x) - y]^2`. The variance for an instance `x` is
    measured across the `L` trained models:
    :math:`{\\sf E}_{L} [M(x) - {\\sf E}_{L} M(x)]^2`. Both are averaged over
    all instances in the dataset.

    If the metric is any of the IR quality measures, we resort to the
    bias/variance decomposition of the mean squared error of the given metric
    w.r.t. its ideal value, e.g., for the case of NDCG,
    :math:`{\\sf E}_{L} [1 - NDCG]^2`. Note that a formal bias/variance
    decomposition for such measures has not been proposed yet.

    Parameters
    ----------
    datasets : list of rankeval.dataset.Dataset
        The dataset instances to analyze.
    algos : list of functions
        Each function should be a wrapper of a learning algorithm, accepting
        four parameters: `train_X`, `train_Y`, `train_q`, `test_X`.

        - `train_X`: numpy.ndarray storing a 2-D matrix of size
          num_docs x num_features
        - `train_Y`: numpy.ndarray storing a vector of document relevance
          labels
        - `train_q`: numpy.ndarray storing a vector of query lengths
        - `test_X`: numpy.ndarray as for `train_X`

        A model is trained on `train_X`, `train_Y`, `train_q`, and used to
        score `test_X`. A numpy.ndarray with such scores must be returned.
    metrics : list of rankeval.metrics.metric.Metric
        The metrics used to compute the error, e.g., MSE or any IR quality
        measure.
    L : int
        Number of iterations.
    k : int
        Number of folds.

    Returns
    -------
    bias_variance : xarray.DataArray
        A DataArray containing the bias/variance decomposition of the error
        for any given dataset, algorithm and metric.

    References
    ----------
    .. [Webb05] Webb, Geoffrey I., and Paul Conilione.
        "Estimating bias and variance from data."
        Pre-publication manuscript
        (`pdf <http://www.csse.monash.edu/webb/-Files/WebbConilione06.pdf>`_)
        (2005).
    .. [Dom05] Domingos, P. "A unified bias-variance decomposition."
        In Proceedings of the 17th International Conference on Machine
        Learning, 2000 (pp. 231-238).
    """
    assert k >= 2
    assert L >= 2
    assert len(datasets) > 0
    assert len(metrics) > 0
    for metric in metrics:
        assert isinstance(metric, Metric)

    progress_bar = IntProgress(
        min=0, max=len(datasets) * len(metrics) * len(algos),
        description="Iterating datasets and metrics")
    display(progress_bar)

    data = np.zeros(shape=(len(datasets), len(metrics), len(algos), 3),
                    dtype=np.float32)
    for idx_dataset, dataset in enumerate(datasets):
        for idx_algo, algo in enumerate(algos):
            for idx_metric, metric in enumerate(metrics):
                progress_bar.value += 1

                scores = _multi_kfold_scoring(dataset, algo=algo, L=L, k=k)

                avg_error = 0.
                avg_bias = 0.
                avg_var = 0.
                if not isinstance(metric, MSE):
                    # mse over the metric, assuming the error is 1-metric
                    # (not exactly Domingos' decomposition)
                    q_scores = np.empty((dataset.n_queries, L),
                                        dtype=np.float32)
                    for i in range(L):
                        q_scores[:, i] = metric.eval(dataset=dataset,
                                                     y_pred=scores[:, i])[1]

                    avg_error = np.mean((q_scores - 1.) ** 2.)
                    avg_pred = np.mean(q_scores, axis=1)
                    avg_bias = np.mean((avg_pred - 1.) ** 2.)
                    avg_var = np.mean(
                        (q_scores - avg_pred.reshape((-1, 1))) ** 2.)
                else:
                    # mse
                    avg_error = np.mean(
                        (scores - dataset.y.reshape((-1, 1))) ** 2.)
                    avg_pred = np.mean(scores, axis=1)
                    avg_bias = np.mean((avg_pred - dataset.y) ** 2.)
                    avg_var = np.mean(
                        (scores - avg_pred.reshape((-1, 1))) ** 2.)

                data[idx_dataset][idx_metric][idx_algo][0] = avg_error
                data[idx_dataset][idx_metric][idx_algo][1] = avg_bias
                data[idx_dataset][idx_metric][idx_algo][2] = avg_var

    progress_bar.bar_style = "success"
    progress_bar.close()

    performance = xr.DataArray(
        data,
        name='Bias/Variance Decomposition',
        coords=[datasets, metrics,
                [a.__name__ for a in algos],
                ['Error', 'Bias', 'Variance']],
        dims=['dataset', 'metric', 'algo', 'error'])

    return performance

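# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): running the
# bias/variance decomposition with the GBRT wrapper sketched above. It
# assumes a training Dataset and an NDCG Metric instance are already
# available; the parameter names are hypothetical placeholders, and the
# no-argument MSE() constructor is an assumption based on this module's own
# import of MSE.
# ---------------------------------------------------------------------------
def _example_bias_variance(train_dataset, ndcg_10, L=10, k=2):
    """
    Sketch: decompose the error of a pointwise GBRT under MSE and NDCG.
    """
    bv = bias_variance(datasets=[train_dataset],
                       algos=[_example_gbrt_algo],
                       metrics=[MSE(), ndcg_10],
                       L=L, k=k)
    # The last dimension holds the 'Error', 'Bias' and 'Variance' components.
    return bv.sel(error='Bias'), bv.sel(error='Variance')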