Source code for rankeval.analysis.topological

# Copyright (c) 2017, All Contributors (see CONTRIBUTORS file)
# Authors:  Salvatore Trani <salvatore.trani@isti.cnr.it>
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


"""
This package implements several topological analyses focused on the
topological characteristics of ensemble-based LtR models. These
analyses can be applied to several models, so as to allow a direct
comparison of the shape of the resulting forests (e.g., forests
trained by different LtR algorithms).
"""

import numpy as np
import scipy.stats

from ..model import RTEnsemble
from _efficient_topological import efficient_topological_analysis


def topological_analysis(model, include_leaves=True):
    """
    Run the topological analysis of an ensemble-based LtR model.

    The shape of every tree composing the model is studied, and the
    returned object exposes several insights about it: the completeness
    of the trees (level by level), the min/max/mean height, and the
    fraction of trees having a specific node. Each node is identified by
    a pair of coordinates row-col, where row is the depth and col is the
    column with respect to a full binary tree.

    Parameters
    ----------
    model : RTEnsemble
        The model to analyze
    include_leaves : bool
        Whether the leaves have to be included in the analysis or not

    Returns
    -------
    object : TopologicalAnalysisResult
        The topological result, to use for retrieving the information
        described above
    """
    analysis = TopologicalAnalysisResult(model, include_leaves)
    return analysis
class TopologicalAnalysisResult(object):
    """
    Container for the topological analysis made on a model.

    Several low-level information are stored in this class, and then
    re-elaborated by its methods to provide high-level analysis.
    """

    def __init__(self, model, include_leaves):
        """
        Analyze the model in a topological perspective

        Parameters
        ----------
        model : RTEnsemble
            the model to analyze from the topological perspective
        include_leaves : bool
            Whether the leaves have to be included in the analysis or not

        Attributes
        ----------
        model : RTEnsemble
            The model analyzed
        height_trees : numpy array
            The ordered height of each tree composing the ensemble
        topology : scipy.sparse.csr_matrix
            The matrix used to store low-level information related to
            the aggregated shape of the trees. Each matrix cell
            identifies a tree node with a pair of coordinates row-col,
            with row highlighting the depth and col the column with
            respect to a full binary tree.
        """
        self.model = model
        # The heavy lifting is delegated to the compiled helper, which
        # returns both the aggregated topology and the per-tree heights.
        self.topology, self.height_trees = efficient_topological_analysis(
            model, include_leaves)

    def describe_tree_height(self):
        """
        Computes several descriptive statistics of the height of the
        trees, as returned by ``scipy.stats.describe``.

        Returns
        -------
        nobs : int
            Number of trees
        minmax : tuple of ndarrays or floats
            Minimum and maximum height of trees
        mean : ndarray or float
            Arithmetic mean of tree heights.
        variance : ndarray or float
            Unbiased variance of the tree heights; the denominator is
            the number of trees minus one.
        skewness : ndarray or float
            Skewness, based on moment calculations with denominator
            equal to the number of trees, i.e. no degrees of freedom
            correction.
        kurtosis : ndarray or float
            Kurtosis (Fisher). The kurtosis is normalized so that it is
            zero for the normal distribution. No degrees of freedom are
            used.
        """
        return scipy.stats.describe(self.height_trees)

    def avg_tree_shape(self):
        """
        Computes the fraction of trees having each node with respect to
        a full binary tree. The fraction is obtained by normalizing the
        count by the number of trees composing the ensemble model.

        Returns
        -------
        fractions : scipy.sparse.csr_matrix
            Sparse matrix with the same shape of the topology matrix,
            where each matrix cell identifies a tree node by a pair of
            coordinates row-col, with row highlighting the depth and col
            the column with respect to a full binary tree. Each cell
            value highlights how many trees have the specific node,
            normalized by the number of trees.
        """
        # Divide by a float so the result is a true fraction regardless
        # of the interpreter's integer-division rules.
        return self.topology / float(self.model.n_trees)

    def fullness_per_level(self):
        """
        Computes, for each level (row) of the topology matrix, the sum
        of the node counts divided by the number of stored nodes of that
        level, normalized by the number of trees.

        Returns
        -------
        fullness : np.array
            An array with one entry per row of the topology matrix,
            where the j-th cell highlights how much the j-th level of
            the trees is full (normalized by the number of trees).
            Levels without any stored node are reported as 0.0.
        """
        # Row-sums are directly supported, and the structure of the CSR
        # format means that the difference between successive values in
        # the indptr array corresponds exactly to the number of stored
        # (nonzero) elements in each row.
        sums = np.asarray(self.topology.sum(axis=1)).ravel()
        counts = np.diff(self.topology.indptr)

        # A level with no stored node would yield 0/0 (NaN plus a
        # RuntimeWarning); such a level is simply not full at all.
        fullness = np.zeros(counts.shape[0], dtype=np.float64)
        populated = counts > 0
        fullness[populated] = (
            np.true_divide(sums[populated], counts[populated])
            / float(self.model.n_trees)
        )
        return fullness