Source code for rankeval.model.proxy_ScikitLearn

# Copyright (c) 2017, All Contributors (see CONTRIBUTORS file)
# Authors: Salvatore Trani <salvatore.trani@isti.cnr.it>
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""Class providing the implementation for loading/storing an XGBoost model
from/to file. The model has to be saved using a textual representation, i.e.,
by using the following method:

.. code-block:: python

    import xgboost as xgb
    ...
    bst = xgb.train(param, dtrain, num_round)
    bst.dump_model('xgboost.model')

The XGBoost project is described here:
    https://github.com/dmlc/xgboost

The XGBoost format adopts a textual representation where each line of the file
represents a single split node or a leaf node, with several attributes
describing the feature and the threshold involved (in case of a split node) or
the output (in case of a leaf). Each node is identified by a unique integer;
any additional information is not useful for rankeval and is thus ignored.
"""

import re

import numpy as np

from rt_ensemble import RTEnsemble

# Line-oriented patterns for the textual model dump. Raw strings are used so
# that regex escapes such as ``\[`` and ``\d`` are not interpreted as (invalid)
# Python string escapes.

# Header line carrying the base score of the ensemble: "base_score=<value>".
base_score_reg = re.compile(r"^base_score=(.+)$")
# Header line carrying the learning rate: "learning_rate=<value>".
learning_rate_reg = re.compile(r"^learning_rate=(.+)$")
# Start of a new tree: "booster[<tree_id>] ...".
tree_reg = re.compile(r"^booster\[(\d+)\]")
# Split node: "<node_id>:[f<feature_id><=<threshold>]".
node_reg = re.compile(r"(\d+):\[f(\d+)<=(.*)\]")
# Leaf node: "<node_id>:leaf=<value>".
leaf_reg = re.compile(r"(\d+):leaf=(.+)$")


class ProxyScikitLearn(object):
    """
    Class providing the implementation for loading/storing a Scikit-Learn
    model from/to file.

    The on-disk representation is a line-oriented text dump: two header lines
    (``base_score=...`` and ``learning_rate=...``) followed, for each tree, by
    a ``booster[<id>]`` line and one line per node, either a split
    (``<id>:[f<feature><=<threshold>]``) or a leaf (``<id>:leaf=<value>``).
    """

    @staticmethod
    def load(file_path, model):
        """
        Load the model from the file identified by file_path.

        Parameters
        ----------
        file_path : str
            The path to the filename where the model has been saved
        model : RTEnsemble
            The model instance to fill

        Raises
        ------
        ValueError
            If a new tree starts while the previous tree still has split
            nodes waiting for a child (malformed file).
        """
        n_trees, n_nodes = ProxyScikitLearn._count_nodes(file_path)
        # Initialize the model and allocate the needed space
        # given the shape and size of the ensemble
        model.initialize(n_trees, n_nodes)

        # Offset of the current tree's root inside the flat node arrays, and
        # number of nodes read so far for the current tree.
        root_node = 0
        num_nodes = 0

        # Stack of (parent_id, side) entries: split nodes still waiting for
        # their left ('L') or right ('R') child.
        pending = []

        with open(file_path, 'r') as f:
            for line in f:
                match = base_score_reg.match(line)
                if match:
                    model.base_score = float(match.group(1))
                    continue

                match = learning_rate_reg.match(line)
                if match:
                    model.learning_rate = float(match.group(1))
                    continue

                match_tree = tree_reg.match(line)
                if match_tree:
                    # Explicit check instead of an assert, so the validation
                    # is not stripped when running with -O.
                    if pending:
                        raise ValueError(
                            "Malformed model file: a new tree starts before "
                            "the previous one is complete")
                    curr_tree = int(match_tree.group(1))
                    # Node ids in the file are local to each tree: shift them
                    # by the number of nodes of all the previous trees.
                    root_node += num_nodes
                    num_nodes = 0
                    model.trees_root[curr_tree] = root_node
                    model.trees_weight[curr_tree] = 1
                    continue

                match_node = node_reg.search(line)
                if match_node:
                    node_id = int(match_node.group(1).strip()) + root_node
                    feature_id = int(match_node.group(2).strip())
                    threshold = float(match_node.group(3).strip())
                    model.trees_nodes_feature[node_id] = feature_id
                    model.trees_nodes_value[node_id] = threshold

                match_leaf = leaf_reg.search(line)
                if match_leaf:
                    node_id = int(match_leaf.group(1).strip()) + root_node
                    leaf_value = float(match_leaf.group(2).strip())
                    model.trees_nodes_value[node_id] = leaf_value

                if match_node or match_leaf:
                    num_nodes += 1
                    # Link the node just read to the parent waiting for it
                    # (the root of a tree has no pending parent).
                    if pending:
                        parent_id, child = pending.pop()
                        if child == 'L':
                            model.trees_left_child[parent_id] = node_id
                        else:
                            model.trees_right_child[parent_id] = node_id

                if match_node:
                    # A split node waits for two children. The right entry is
                    # pushed first so that the left one is popped first: each
                    # split is followed by its left subtree, then its right.
                    pending.extend([(node_id, 'R'), (node_id, 'L')])

    @staticmethod
    def save(file_path, model):
        """
        Save the model onto the file identified by file_path.

        Parameters
        ----------
        file_path : str
            The path to the filename where the model has to be saved
        model : RTEnsemble
            The RTEnsemble model to save on file

        Returns
        -------
        status : bool
            Returns true if the save is successful, false otherwise

        Raises
        ------
        NotImplementedError
            Always: saving is not implemented for this format.
        """
        raise NotImplementedError("Feature not implemented!")

    @staticmethod
    def _count_nodes(file_path):
        """
        Count the total number of nodes (both split and leaf nodes) in the
        model identified by file_path.

        Parameters
        ----------
        file_path : str
            The path to the filename where the model has been saved

        Returns
        -------
        tuple(n_trees, n_nodes) : tuple(int, int)
            The total number of trees and nodes (both split and leaf nodes)
            in the model identified by file_path.
        """
        n_nodes = 0
        n_trees = 0
        with open(file_path, 'r') as f:
            for line in f:
                if tree_reg.match(line):
                    n_trees += 1
                    continue
                if node_reg.search(line):
                    n_nodes += 1
                if leaf_reg.search(line):
                    n_nodes += 1
        return n_trees, n_nodes

    @staticmethod
    def export_scikit_model(model, file_path):
        """
        Export an ensemble model to file_path using the textual format read
        by `load`.

        Floating point values are written with their shortest round-trippable
        representation, so that reading the file back yields exactly the
        values held by the exported model.

        Parameters
        ----------
        model : object
            The ensemble to export. It must expose `estimators_` (an array of
            trees), `init_` (holding the base score in either a `quantile` or
            a `mean` attribute) and `learning_rate`.
        file_path : str
            The path to the filename where the model has to be written

        Raises
        ------
        TypeError
            If the model has no `estimators_` or `init_` attribute, or if
            `init_` exposes neither `quantile` nor `mean`.
        """
        if not hasattr(model, 'estimators_'):
            raise TypeError("Only ensemble-based models are supported!")
        if not hasattr(model, 'init_'):
            raise TypeError("Base score missing!")

        if hasattr(model.init_, "quantile"):
            base_score = model.init_.quantile
        elif hasattr(model.init_, "mean"):
            base_score = model.init_.mean
        else:
            raise TypeError("Base score unknown!")

        with open(file_path, 'w') as writer:
            # repr() of a float round-trips exactly, whereas "%f" would
            # truncate the value to six decimal places.
            writer.write("base_score=%s\n" % repr(float(base_score)))
            writer.write(
                "learning_rate=%s\n" % repr(float(model.learning_rate)))
            for tree_id, tree in enumerate(model.estimators_.flatten()):
                ProxyScikitLearn._export_tree(writer, tree, tree_id)

    @staticmethod
    def _export_tree(writer, tree, tree_id=0):
        """
        Write a single tree onto writer, one node per line, visiting each
        split node before its left and then its right subtree.

        Parameters
        ----------
        writer : file-like object
            Object with a `write` method receiving the textual dump
        tree : object
            The tree to export; it must expose a `tree_` attribute
        tree_id : int, optional
            Identifier written in the ``booster[<id>]`` header line

        Raises
        ------
        TypeError
            If the given object has no `tree_` attribute.
        """
        from sklearn.tree import _tree

        if not hasattr(tree, 'tree_'):
            raise TypeError("Only tree-based models are supported!")

        tree_ = tree.tree_
        # Names of the features used by the tree; entries equal to
        # TREE_UNDEFINED do not refer to a feature and are skipped.
        feature_name = ["f%d" % i for i in np.unique(tree_.feature)
                        if i != _tree.TREE_UNDEFINED]
        writer.write("booster[%d] [%s]:\n" % (tree_id, ' '.join(feature_name)))

        def recurse(node, depth):
            indent = '\t' * depth
            if tree_.feature[node] != _tree.TREE_UNDEFINED:
                # Split node: written before its left and right subtrees.
                name = "f%d" % tree_.feature[node]
                # Full precision: "%f" would truncate the threshold to six
                # decimals and could alter the routing of the documents.
                threshold = repr(float(tree_.threshold[node]))
                writer.write("%s%d:[%s<=%s]\n" %
                             (indent, node, name, threshold))
                recurse(tree_.children_left[node], depth + 1)
                recurse(tree_.children_right[node], depth + 1)
            else:
                if tree_.value[node].size > 1:
                    # Several values stored in the node: export the index of
                    # the largest one, prefixed by "c".
                    leaf_value = "c%d" % np.argmax(tree_.value[node])
                else:
                    leaf_value = repr(float(tree_.value[node].flatten()[0]))
                writer.write("%s%d:leaf=%s\n" % (indent, node, leaf_value))

        recurse(0, 1)