# Source code for rankeval.dataset.dataset

# Copyright (c) 2017, All Contributors (see CONTRIBUTORS file)
# Authors: Salvatore Trani <salvatore.trani@isti.cnr.it>
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""
This module implements the generic class for loading/dumping a dataset from/to file.
"""
import numpy as np
import copy

from .svmlight_format import load_svmlight_file, dump_svmlight_file


class Dataset(object):
    """
    In-memory representation of a dataset: feature matrix, gold labels and
    query boundaries, plus utilities for loading/dumping it from/to file,
    selecting features and iterating over queries.

    Attributes
    ----------
    X : numpy 2d array of float
        It is a dense numpy matrix of shape (n_samples, n_features).
        It is None after clear_X() has been called.
    y : numpy 1d array of float
        It is a ndarray of shape (n_samples,) with the gold label
    query_ids : numpy 1d array of int
        It is a ndarray of shape (n_queries + 1,) holding the query offsets:
        the samples of the i-th query are the rows in the half-open range
        [query_ids[i], query_ids[i + 1]).
    name : str
        The name to give to the dataset
    n_instances : int
        The number of instances in the dataset
    n_features : int
        The number of features in the dataset
    n_queries : int
        The number of queries in the dataset
    """

    def __init__(self, X, y, query_ids, name=None):
        """
        Build a dataset from its feature matrix, labels and query information.

        Parameters
        ----------
        X : numpy.ndarray
            The matrix with feature values
        y : numpy.array
            The vector with label values
        query_ids : numpy.array or list
            Either the query id of each sample (one entry per row of X, with
            the samples of each query stored contiguously), or the already
            compacted query offsets. The two cases are told apart only by
            comparing the length of query_ids with the number of rows of X,
            so offsets whose length equals the number of rows are
            interpreted as per-sample query ids.
        name : str, optional
            The name to give to the dataset. When None, a name built from
            the shape of X is used.
        """
        # Accept plain sequences too (the code below relies on `.size`).
        query_ids = np.asarray(query_ids)

        if len(query_ids) == X.shape[0]:
            # Convert from query ids per sample to query offsets.
            # np.unique returns the first-occurrence indices ordered by query
            # id VALUE; sort them so that the offsets are increasing even
            # when the query ids do not appear in ascending order.
            first_rows = np.unique(query_ids, return_index=True)[1]
            self.query_ids = np.append(np.sort(first_rows), query_ids.size)
        else:
            self.query_ids = query_ids

        self.X, self.y = X, y
        if name is not None:
            self.name = name
        else:
            self.name = "Dataset %s" % (self.X.shape,)

        self.n_instances = len(self.y)
        self.n_features = self.X.shape[1]
        # The offsets hold one more entry than the number of queries.
        self.n_queries = len(self.query_ids) - 1

    @staticmethod
    def load(f, name=None, format="svmlight"):
        """
        This static method implements the loading of a dataset from file.

        Parameters
        ----------
        f : path
            The file name of the dataset to load
        name : str
            The name to be given to the current dataset
        format : str
            The format of the dataset file to load (currently only the
            "svmlight" format is supported)

        Returns
        -------
        dataset : Dataset
            The dataset read from file

        Raises
        ------
        TypeError
            If the given format is not supported
        """
        if format != "svmlight":
            raise TypeError("Dataset format %s is not yet supported!" % format)
        X, y, query_ids = load_svmlight_file(f, query_id=True)
        return Dataset(X, y, query_ids, name)

    def subset_features(self, features):
        """
        Create a new Dataset with only the features identified by the given
        features parameters (indices). It is useful for performing feature
        selection. The current dataset is left untouched.

        Parameters
        ----------
        features : numpy array or list
            The indices of the features to select in the resulting dataset

        Returns
        -------
        dataset : rankeval.dataset.Dataset
            The resulting dataset with the given subset of features
        """
        new_dataset = copy.deepcopy(self)
        selected = new_dataset.X[:, features]
        new_dataset.X = selected
        # Keep the feature count in sync with the reduced matrix (it would
        # otherwise still report the number of features of the original).
        if selected.ndim == 2:
            new_dataset.n_features = selected.shape[1]
        return new_dataset

    def _unrolled_query_ids(self):
        """
        Expand the compacted query offsets into one query id per sample.

        Returns
        -------
        query_ids : numpy 1d array of float32
            The i-th query (in row order) is given the id i, counting from 1,
            repeated once for each of its samples.
        """
        offsets = np.asarray(self.query_ids)
        query_sizes = np.diff(offsets)
        # float32 is the dtype historically handed to the svmlight dumper.
        new_ids = np.arange(1, len(offsets), dtype=np.float32)
        return np.repeat(new_ids, query_sizes)

    def dump(self, f, format):
        """
        This method implements the writing of a previously loaded dataset
        according to the given format on file.

        The query ids written to file are regenerated from the query offsets
        as consecutive integers starting from 1 (the ids originally given to
        the constructor are not retained by the dataset).

        Parameters
        ----------
        f : path
            The file path where to store the dataset
        format : str
            The format to use for dumping the dataset on file (currently only
            the "svmlight" format is supported)

        Raises
        ------
        TypeError
            If the given format is not supported
        """
        # Fail early, before doing any work, on unsupported formats.
        if format != "svmlight":
            raise TypeError("Dataset format %s is not yet supported!" % format)

        # self.query_ids always stores offsets, thus it always needs to be
        # unrolled. Deciding it by comparing lengths is unreliable: the two
        # lengths coincide whenever n_queries + 1 == n_instances.
        dump_svmlight_file(self.X, self.y, f, self._unrolled_query_ids())

    def clear_X(self):
        """
        This method clears the space used by the dataset instance for storing
        X (the dataset features). This space is used only for scoring, thus it
        can be freed after.
        """
        # Rebinding the attribute drops this instance's reference to X.
        self.X = None

    def query_offset_iterator(self):
        """
        This method implements an iterator over the offsets of the query_ids
        in the dataset.

        Returns
        -------
        offsets : tuple of (int, int)
            The row index of instances belonging to the same query.
            The two indices represent (start, end) offsets.
        """
        # Pair each offset with the following one: (start, end) of a query.
        for start, end in zip(self.query_ids[:-1], self.query_ids[1:]):
            yield start, end

    def __str__(self):
        return self.name

    def __eq__(self, other):
        # Two datasets are equal when features, labels and query offsets
        # match. np.array_equal copes with mismatching shapes and with X
        # being None (after clear_X), where `(a == b).all()` would fail.
        if not isinstance(other, Dataset):
            return NotImplemented
        return (np.array_equal(self.X, other.X) and
                np.array_equal(self.y, other.y) and
                np.array_equal(self.query_ids, other.query_ids))

    def __ne__(self, other):
        # Not strictly necessary, but to avoid having both x==y and x!=y
        # True at the same time
        return not (self == other)