Source code for rankeval.analysis.feature

# Copyright (c) 2017, All Contributors (see CONTRIBUTORS file)
# Authors: Salvatore Trani <salvatore.trani@isti.cnr.it>, 
#          Claudio Lucchese <claudio.lucchese@isti.cnr.it>
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""
This module implements feature importance analysis.
"""

import numpy as np
from collections import deque
import xarray as xr

from ..model import RTEnsemble
from ..metrics import MSE, RMSE

from ._efficient_feature import eff_feature_importance, \
    eff_feature_importance_tree


def feature_importance(model, dataset, metric=None):
    """
    This method computes the feature importance relative to the given model
    and dataset.

    Parameters
    ----------
    dataset : Dataset
        The dataset used to evaluate the model (typically the one used
        to train the model).
    model : RTEnsemble
        The model whose features we want to evaluate.
    metric : rankeval.metrics.Metric
        The metric used to compute the feature gain at each split node.
        The default metric is the Mean Squared Error (MSE).

    Returns
    -------
    feature_importance : xarray.DataArray
        A DataArray containing the feature importance scores, one for each
        feature of the given model scored on the given dataset. Two main
        pieces of information are stored in the DataArray:

        - feature_importance: a vector of importance values, one for each
          feature in the given model. The importance values reported are the
          sum of the improvements, in terms of MSE, contributed by each
          feature, evaluated on the given dataset. Each improvement is
          computed as the delta MSE before and after a split node, i.e., how
          much the MSE improves as a result of the split.
        - feature_count: a vector of count values, one for each feature in
          the given model. The count values report the number of times each
          feature is used in a split node, i.e., to improve the MSE.
    """
    if metric is None:
        metric = MSE()

    if isinstance(metric, (MSE, RMSE)):
        feature_imp, feature_count = eff_feature_importance(model, dataset)
        if isinstance(metric, RMSE):
            feature_imp = np.sqrt(feature_imp)
    else:
        # initialize the features importance
        feature_imp = np.zeros(dataset.n_features, dtype=np.float32)
        # initialize the features count
        feature_count = np.zeros(dataset.n_features, dtype=np.uint16)

        # default scores before evaluating the first tree
        y_pred = np.zeros(dataset.n_instances)

        # iterate over the trees of the model
        for tree_id in np.arange(model.n_trees):
            _feature_importance_tree(model, dataset, tree_id, y_pred,
                                     metric, feature_imp, feature_count)

    performance = xr.DataArray(
        [feature_imp, feature_count],
        name='Feature Importance Analysis',
        coords=[['importance', 'count'],
                np.arange(dataset.n_features, dtype=np.uint16)],
        dims=['type', 'feature'])

    return performance
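

# A minimal usage sketch of ``feature_importance``. The file paths below are
# hypothetical, and the loader signatures (``RTEnsemble``, ``Dataset.load``)
# are assumptions for illustration: check the rankeval documentation for the
# exact API of the version in use.
def _example_feature_importance():  # illustrative only, not part of the API
    from rankeval.dataset import Dataset

    # hypothetical input files: a QuickRank model and an SVMLight dataset
    model = RTEnsemble('model.xml', format='QuickRank')
    dataset = Dataset.load('test.txt', format='svmlight')

    fi = feature_importance(model, dataset)

    # the DataArray is indexed by ('type', 'feature'): select the importance
    # row and rank the features by decreasing cumulative MSE improvement
    importance = fi.sel(type='importance').values
    count = fi.sel(type='count').values
    for feature_id in np.argsort(importance)[::-1][:10]:
        print(feature_id, importance[feature_id], count[feature_id])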


def _feature_importance_tree(model, dataset, tree_id, y_pred, metric,
                             feature_imp, feature_count):
    """
    This method computes the feature importance relative to a single tree
    of the given model.

    Parameters
    ----------
    dataset : Dataset
        The dataset used to evaluate the model (typically the one used
        to train the model).
    model : RTEnsemble
        The model whose features we want to evaluate.
    tree_id : int
        Id of the root of the tree to be evaluated.
    y_pred : numpy.array
        Current instance predictions.
    metric : rankeval.metrics.Metric
        The metric used to compute the feature gain at each split node.
    feature_imp : numpy.array
        The feature importance array, to be updated with the evaluation
        of the current tree.
    feature_count : numpy.array
        The feature count array, to be updated with the evaluation of the
        current tree.

    Returns
    -------
    y_pred_tree : numpy.array
        A vector of delta instance scores relative to the current tree.
    """
    if isinstance(metric, (MSE, RMSE)):
        y_pred_tree = eff_feature_importance_tree(
            model, dataset, tree_id, y_pred, feature_imp, feature_count)
        if isinstance(metric, RMSE):
            np.sqrt(feature_imp, out=feature_imp)
        return y_pred_tree

    # the residual scores to fit
    y_target = dataset.y - y_pred

    # set the default y_pred vector to the mean residual score
    y_pred_tree = np.full(dataset.n_instances, fill_value=y_target.mean())

    # queue with the node_id to visit and the list of documents in the node
    root_node_id = model.trees_root[tree_id]
    deq = deque([(root_node_id, 0, None)])

    # we score the tree on a level basis, so that the feature gain of each
    # split node is computed according to the predictions of the parent
    # level (i.e., all the nodes in the tree with a depth < node_depth).
    # The depth-0 predictions correspond to the mean residual score.
    y_pred_tree_depth = [y_pred_tree]

    # support array storing the local modifications made by a split node
    y_pred_mod = np.empty(dataset.n_instances, dtype=np.float32)

    while len(deq) > 0:
        # current node info
        node_id, depth, doc_list = deq.popleft()

        feature_id = model.trees_nodes_feature[node_id]
        threshold = model.trees_nodes_value[node_id]

        feature_count[feature_id] += 1

        # doc_list is set only for internal (non-root) nodes
        if doc_list is not None:
            left_docs = \
                doc_list[dataset.X[doc_list, feature_id] <= threshold]
            right_docs = \
                doc_list[dataset.X[doc_list, feature_id] > threshold]
        else:
            left_docs = np.where(dataset.X[:, feature_id] <= threshold)[0]
            right_docs = np.where(dataset.X[:, feature_id] > threshold)[0]

        # copy the parent predictions inside y_pred_mod
        y_pred_mod[:] = y_pred_tree_depth[depth]

        # metric before updating the leaves (dataset.y = y_target + y_pred)
        pre_split_metric, _ = metric.eval(dataset, y_pred_mod + y_pred)

        # fit the residuals: each child predicts its own mean
        if left_docs.size > 0:
            y_pred_mod[left_docs] = y_target[left_docs].mean()
        if right_docs.size > 0:
            y_pred_mod[right_docs] = y_target[right_docs].mean()

        # update the y_pred of the next level
        if depth + 1 > len(y_pred_tree_depth) - 1:
            y_pred_tree_depth.append(y_pred_tree_depth[depth].copy())
        y_pred_tree_depth[depth + 1][doc_list] = y_pred_mod[doc_list]

        # metric after updating the leaves
        post_split_metric, _ = metric.eval(dataset, y_pred_mod + y_pred)

        # improvement of the metric score due to this split
        delta_metric = pre_split_metric - post_split_metric

        # update the feature importance
        feature_imp[feature_id] += delta_metric

        # if the children are not leaves, add them to the queue of the
        # nodes to visit
        if not model.is_leaf_node(model.trees_left_child[node_id]):
            deq.append((model.trees_left_child[node_id],
                        depth + 1, left_docs))
        if not model.is_leaf_node(model.trees_right_child[node_id]):
            deq.append((model.trees_right_child[node_id],
                        depth + 1, right_docs))

    # update the leaf outputs including the tree weight
    y_pred_tree_depth[-1] *= model.trees_weight[tree_id]

    # update y_pred with the predictions made by the last level of the tree
    y_pred += y_pred_tree_depth[-1]

    return y_pred_tree_depth[-1]
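

if __name__ == '__main__':
    # Toy standalone sketch (illustration only, with made-up numbers): the
    # per-split gain accumulated in feature_imp above is the drop in MSE
    # obtained when the parent prediction is replaced by the left/right
    # child means.
    x = np.array([1.0, 2.0, 3.0, 10.0, 11.0, 12.0])  # a single feature
    y = np.array([0.1, 0.2, 0.1, 0.9, 1.0, 0.8])     # residual targets
    threshold = 5.0

    # parent node: one mean prediction for all the documents
    pre_split_mse = ((y - y.mean()) ** 2).mean()

    # split node: each side predicts its own mean
    left, right = y[x <= threshold], y[x > threshold]
    y_split = np.where(x <= threshold, left.mean(), right.mean())
    post_split_mse = ((y - y_split) ** 2).mean()

    # this delta is what gets accumulated into feature_imp[feature_id]
    print('split gain:', pre_split_mse - post_split_mse)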