Source code for rankeval.dataset.dataset

# Copyright (c) 2017, All Contributors (see CONTRIBUTORS file)
# Authors: Salvatore Trani <salvatore.trani@isti.cnr.it>
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""
This module implements the generic class for loading/dumping a dataset from/to file.
"""
import numpy as np
import copy

from .svmlight_format import load_svmlight_file, dump_svmlight_file


[docs]class Dataset(object): """ This class describe the dataset object, with its utility and features Attributes ---------- X : numpy 2d array of float It is a dense numpy matrix of shape (n_samples, n_features), y : numpy 1d array of float It is a ndarray of shape (n_samples,) with the gold label query_ids : numpy 1d array of int It is a ndarray of shape(nsamples,) name : str The name to give to the dataset n_instances : int The number of instances in the dataset n_features : int The number of features in the dataset n_queries : int The number of queries in the dataset """ def __init__(self, X, y, query_ids, name=None): """ This module implements the generic class for loading/dumping a dataset from/to file. Parameters ---------- X : numpy.ndarray The matrix with feature values y : numpy.array The vector with label values query_ids : numpy.array The vector with the query_id for each sample. """ if len(query_ids) == X.shape[0]: # convert from query_ids per sample to query offset self.query_ids = np.append(np.unique(query_ids, return_index=True)[1], query_ids.size) else: self.query_ids = query_ids self.X, self.y = X, y self.name = "Dataset %s" % (self.X.shape,) if name is not None: self.name = name self.n_instances = len(self.y) self.n_features = self.X.shape[1] self.n_queries = len(self.query_ids) - 1
[docs] @staticmethod def load(f, name=None, format="svmlight"): """ This static method implements the loading of a dataset from file. Parameters ---------- f : path The file name of the dataset to load name : str The name to be given to the current dataset format : str The format of the dataset file to load (actually supported is only "svmlight" format) Returns ------- dataset : Dataset The dataset read from file """ if format == "svmlight": X, y, query_ids = load_svmlight_file(f, query_id=True) else: raise TypeError("Dataset format %s is not yet supported!" % format) return Dataset(X, y, query_ids, name)
[docs] def subset_features(self, features): """ Create a new Dataset with only the features identified by the given features parameters (indices). It is useful for performing feature selection. Parameters ---------- features : numpy array or list The indices of the features to select in the resulting dataset Returns ------- dataset : rankeval.dataset.Dataset The resulting dataset with the given subset of features """ new_dataset = copy.deepcopy(self) new_dataset.X = new_dataset.X[:, features] return new_dataset
[docs] def dump(self, f, format): """ This method implements the writing of a previously loaded dataset according to the given format on file Parameters ---------- f : path The file path where to store the dataset format : str The format to use for dumping the dataset on file (actually supported is only "svmlight" format) """ if len(self.query_ids) != self.X.shape[0]: # we need to unroll the query_ids (it is compacted: it reports only # the offset where a new query id starts) query_ids = np.ndarray(self.X.shape[0], dtype=np.float32) last_idx = 0 for qid, qid_offset in enumerate(self.query_ids, start=1): for idx in np.arange(last_idx, qid_offset): query_ids[idx] = qid last_idx = qid_offset else: query_ids = self.query_ids if format == "svmlight": dump_svmlight_file(self.X, self.y, f, query_ids) else: raise TypeError("Dataset format %s is not yet supported!" % format)
[docs] def clear_X(self): """ This method clears the space used by the dataset instance for storing X (the dataset features). This space is used only for scoring, thus it can be freed after. """ del self.X self.X = None
[docs] def query_offset_iterator(self): """ This method implements and iterator over the offsets of the query_ids in the dataset. Returns ------- offsets : tuple of (int, int) The row index of instances belonging to the same query. The two indices represent (start, end) offsets. """ for i in np.arange(len(self.query_ids) - 1): yield self.query_ids[i], self.query_ids[i+1]
def __str__(self): return self.name def __eq__(self, other): return (self.X == other.X).all() and \ (self.y == other.y).all() and \ (self.query_ids == other.query_ids).all() def __ne__(self, other): # Not strictly necessary, but to avoid having both x==y and x!=y # True at the same time return not(self == other)