# Copyright (c) 2017, All Contributors (see CONTRIBUTORS file)
# Authors: Salvatore Trani <salvatore.trani@isti.cnr.it>
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""
This package implements several topological analyses focused on the
topological characteristics of ensemble-based LtR models. These
functionalities can be applied to several models, so as to have a
direct comparison of the shape of the resulting forests (e.g., forests
trained by different LtR algorithms).
"""
import numpy as np
import scipy.stats
from ..model import RTEnsemble
from _efficient_topological import efficient_topological_analysis
def topological_analysis(model, include_leaves=True):
    """
    Run the topological analysis of an ensemble-based LtR model.

    Given a model, the analysis studies the shape of each tree composing
    the model and returns several pieces of information useful for
    gaining insight into the shape of the trees, their completeness
    (level by level), the min/max/mean height, and the fraction of trees
    having a specific node (where each node is identified by a pair of
    coordinates row-col, with row giving the depth and col the column
    with respect to a full binary tree).

    This function is a thin convenience wrapper: it simply builds and
    returns a ``TopologicalAnalysisResult`` for the given arguments.

    Parameters
    ----------
    model : RTEnsemble
        The model to analyze.
    include_leaves : bool
        Whether the leaves have to be included in the analysis or not.

    Returns
    -------
    object : TopologicalAnalysisResult
        The topological result, to be queried for the individual
        statistics.
    """
    return TopologicalAnalysisResult(model, include_leaves)
class TopologicalAnalysisResult(object):
    """
    Result of the topological analysis made on a model.

    Several low-level pieces of information are stored in this class and
    then re-elaborated by its methods to provide high-level analyses.
    """

    def __init__(self, model, include_leaves):
        """
        Analyze the model from a topological perspective.

        Parameters
        ----------
        model : RTEnsemble
            The model to analyze from the topological perspective.
        include_leaves : bool
            Whether the leaves have to be included in the analysis or not.

        Attributes
        ----------
        model : RTEnsemble
            The model analyzed.
        height_trees : numpy array
            The ordered height of each tree composing the ensemble.
        topology : scipy.sparse.csr_matrix
            The matrix used to store low-level information related to the
            aggregated shape of the trees. Each matrix cell identifies a
            tree node with a pair of coordinates row-col, with row giving
            the depth and col the column with respect to a full binary
            tree.
        """
        self.model = model
        # The heavy lifting is delegated to the compiled helper, which
        # returns the aggregated topology matrix and the per-tree heights.
        self.topology, self.height_trees = efficient_topological_analysis(
            model, include_leaves)

    def describe_tree_height(self):
        """
        Compute several descriptive statistics of the height of the trees.

        Returns
        -------
        nobs : int
            Number of trees.
        minmax : tuple of ndarrays or floats
            Minimum and maximum height of trees.
        mean : ndarray or float
            Arithmetic mean of tree heights.
        variance : ndarray or float
            Unbiased variance of the tree heights; the denominator is the
            number of trees minus one.
        skewness : ndarray or float
            Skewness, based on moment calculations with denominator equal
            to the number of trees, i.e. no degrees of freedom correction.
        kurtosis : ndarray or float
            Kurtosis (Fisher). The kurtosis is normalized so that it is
            zero for the normal distribution. No degrees of freedom are
            used.
        """
        return scipy.stats.describe(self.height_trees)

    def avg_tree_shape(self):
        """
        Compute the fraction of trees having each node of a full binary tree.

        The fraction is obtained by normalizing the per-node count stored
        in the topology matrix by the number of trees composing the
        ensemble model.

        Returns
        -------
        fractions : scipy.sparse.csr_matrix
            Sparse matrix with the same shape as the topology matrix, where
            each matrix cell identifies a tree node by a pair of
            coordinates row-col, with row giving the depth and col the
            column with respect to a full binary tree. Each cell value
            tells how many trees have the specific node, normalized by the
            number of trees.
        """
        return self.topology / self.model.n_trees

    def fullness_per_level(self):
        """
        Compute, level by level, how full the trees of the ensemble are.

        For each row (level) of the topology matrix the value returned is
        the sum of the row divided by the number of entries stored in that
        row, further normalized by the number of trees.

        Returns
        -------
        fullness : np.array
            One-dimensional array with one entry per row (level) of the
            topology matrix, where the j-th cell tells how full the j-th
            level of the trees is (normalized by the number of trees).
            A level without any stored entry yields NaN (0 / 0).
        """
        # Row sums are directly supported by the sparse matrix. Flatten
        # through np.asarray rather than the np.matrix-only ``.A1`` so the
        # code works whether the sum comes back as a matrix or an ndarray.
        row_sums = np.asarray(self.topology.sum(axis=1)).ravel()

        # In the CSR format the difference between successive values of
        # ``indptr`` is exactly the number of entries stored in each row
        # (i.e. the nonzero elements, provided no explicit zero is stored).
        entries_per_row = np.diff(self.topology.indptr)

        # Explicit true division: both operands may be integer arrays and
        # must never be floor-divided.
        per_level = np.true_divide(row_sums, entries_per_row)
        return per_level / self.model.n_trees