# Copyright (c) 2017, All Contributors (see CONTRIBUTORS file)
# Authors: Franco Maria Nardini <francomaria.nardini@isti.cnr.it>
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import os
import six
import json
import shutil
import tarfile
from os import environ
from os import makedirs
from os.path import exists
from os.path import expanduser
from os.path import join
from .dataset import Dataset
from .dataset_container import DatasetContainer
if six.PY3:
from urllib.request import urlopen
else:
from urllib2 import urlopen
def __dataset_catalogue__():
    """
    Fetch the remote catalogue describing the available datasets and
    models, and return it as a dictionary keyed by dataset name.
    """
    resource_path = "http://rankeval.isti.cnr.it/rankeval-datasets/dataset_dictionary.json"
    json_file = urlopen(resource_path)
    data = json.load(json_file)
    return data
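# Illustrative catalogue entry, inferred from the keys this module reads
# below. Field values are hypothetical; the actual JSON may contain more
# fields:
#
#     "msn10k": {
#         "DATASET_NAME": "msn10k",
#         "DATASET_FORMAT": "svmlight",
#         "DATASET_URL": "http://.../msn10k.tar.gz",
#         "DATASET_ARCHIVE_NAME": "msn10k.tar.gz",
#         "TRAIN_FILE": "train.txt",
#         "VALIDATION_FILE": "vali.txt",
#         "TEST_FILE": "test.txt",
#         "COMMON_SUBFOLDER_NAME": "Fold",
#         "LICENSE_FILE": "LICENSE.txt",
#         "BLOG_POST_URL": "http://...",
#         "MODELS_URL": "http://.../models.tar.gz",
#         "MODELS_ARCHIVE_NAME": "models.tar.gz"
#     }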
def __get_data_home__(data_home=None):
"""
Return the path of the rankeval data dir.
This folder is used by some large dataset loaders to avoid
downloading the data several times.
By default the data dir is set to a folder named 'rankeval_data'
in the user home folder.
Alternatively, it can be set by the 'RANKEVAL_DATA' environment
variable or programmatically by giving an explicit folder path. The
'~' symbol is expanded to the user home folder.
If the folder does not already exist, it is automatically created.
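    Examples
    --------
    Illustrative only (the result depends on the environment; here
    RANKEVAL_DATA is assumed unset and the home folder is '/home/user'):

    >>> __get_data_home__()                      # doctest: +SKIP
    '/home/user/rankeval_data'
    >>> __get_data_home__('/tmp/rankeval_data')  # doctest: +SKIP
    '/tmp/rankeval_data'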
"""
if data_home is None:
data_home = environ.get('RANKEVAL_DATA', join('~', 'rankeval_data'))
data_home = expanduser(data_home)
if not exists(data_home):
makedirs(data_home)
return data_home
def __fetch_dataset_and_models__(dataset_dictionary, fold=None, data_home=None,
                                 download_if_missing=True, force_download=False,
                                 with_models=True):
""" Download a given dataset (and models, if needed).
Parameters
----------
    dataset_dictionary : dict, mandatory.
        The properties of the requested dataset (name, URL, repository,
        license, etc.).
    fold : optional, None by default.
        If provided, an integer identifying the specific fold to load.
        E.g., dataset_name=msn10k, fold=1 will load the train/validation/test
        files from the 'Fold1' directory. This option applies only to
        datasets that are already k-folded.
data_home : optional, default: None.
Specify a data folder for the datasets. If None,
all data is stored in the '~/rankeval_data' subfolder.
download_if_missing : optional, True by default.
If False, raise an IOError if the data is not locally available
instead of trying to download the data from the source site.
force_download : optional, False by default.
If True, download data even if it is on disk.
with_models : optional, True by default.
When True, the method downloads the models generated with different
tools (QuickRank, LightGBM, XGBoost, etc.) to ease the comparison.
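    Returns
    -------
    data : dict
        Paths to the fetched resources: 'train', 'test' and, when available,
        'validation' map to the dataset files, 'license_agreement' to the
        license text, and 'models' to the list of model files (only when
        with_models is True).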
"""
data_home = os.path.join(data_home, dataset_dictionary['DATASET_NAME'])
dataset_home = os.path.join(data_home, "dataset")
models_home = os.path.join(data_home, "models")
# DATASET
    if not download_if_missing and not os.path.exists(data_home):
        raise IOError('dataset not found at %s' % data_home)
    if (fold is not None) and (dataset_dictionary.get('COMMON_SUBFOLDER_NAME') is None):
        raise ValueError('no k-fold splits available for this dataset')
# delete data_home if force_download is True, then re-create data_home dir
if force_download:
if os.path.exists(data_home):
shutil.rmtree(data_home)
os.makedirs(data_home)
# preparing file names...
archive_name = os.path.join(dataset_home, dataset_dictionary['DATASET_ARCHIVE_NAME'])
models_archive_name = os.path.join(models_home, dataset_dictionary['MODELS_ARCHIVE_NAME'])
if fold is None:
train_file_path = os.path.join(dataset_home, dataset_dictionary['TRAIN_FILE'])
test_file_path = os.path.join(dataset_home, dataset_dictionary['TEST_FILE'])
if dataset_dictionary.get('VALIDATION_FILE') is not None:
validation_file_path = os.path.join(dataset_home, dataset_dictionary['VALIDATION_FILE'])
else:
subfolder_fold = os.path.join(dataset_home, dataset_dictionary['COMMON_SUBFOLDER_NAME'] + str(fold))
train_file_path = os.path.join(subfolder_fold, dataset_dictionary['TRAIN_FILE'])
test_file_path = os.path.join(subfolder_fold, dataset_dictionary['TEST_FILE'])
if dataset_dictionary.get('VALIDATION_FILE') is not None:
validation_file_path = os.path.join(subfolder_fold, dataset_dictionary['VALIDATION_FILE'])
model_subfolder = os.path.join(models_home, dataset_dictionary['COMMON_SUBFOLDER_NAME'] + str(fold))
# everything will be stored in a dictionary to return
data = dict()
    dataset_already_downloaded = os.path.exists(dataset_home)
if not dataset_already_downloaded:
os.makedirs(dataset_home)
print "Downloading dataset. This may take a few minutes."
data_url = dataset_dictionary['DATASET_URL']
print "Downloading dataset from %s " % data_url
opener = urlopen(data_url)
with open(archive_name, 'wb') as f:
f.write(opener.read())
print "Decompressing %s" % archive_name
tarfile.open(archive_name, "r:gz").extractall(path=dataset_home)
os.remove(archive_name)
license_agreement = ""
if dataset_dictionary.get('LICENSE_FILE') is not None:
for line in open(os.path.join(dataset_home, dataset_dictionary['LICENSE_FILE']), 'r'):
license_agreement += line
else:
license_agreement = "Please, check the terms of use before using the dataset. Here the link: %s" % dataset_dictionary['BLOG_POST_URL']
# filling data structure to return
data['train'] = train_file_path
data['test'] = test_file_path
if dataset_dictionary.get('VALIDATION_FILE') is not None:
data['validation'] = validation_file_path
data['license_agreement'] = license_agreement
# MODELS
    if with_models:
        models_already_downloaded = os.path.exists(models_home)
if not models_already_downloaded:
os.makedirs(models_home)
models_url = dataset_dictionary['MODELS_URL']
print "Downloading letor models from %s" % models_url
opener = urlopen(models_url)
with open(models_archive_name, 'wb') as f:
f.write(opener.read())
print "Decompressing %s" % models_archive_name
tarfile.open(models_archive_name, "r:gz").extractall(
path=models_home)
os.remove(models_archive_name)
# filling data structure to return
        matches = []
        search_root = models_home if fold is None else model_subfolder
        for root, dirnames, filenames in os.walk(search_root):
            for filename in filenames:
                matches.append(os.path.join(root, filename))
data['models'] = matches
return data
def load_dataset(dataset_name, fold=None, download_if_missing=True,
                 force_download=False, with_models=True):
"""
The method allow to download a given dataset (and available models)
by providing its name.
Datasets and models are available at the following link:
http://rankeval.isti.cnr.it/rankeval-datasets/dataset_dictionary.json
Parameters
----------
    dataset_name : str, mandatory.
        The name of the dataset (and models) to download.
fold : optional, None by default.
If provided, an integer identifying the specific fold to load.
        E.g., dataset_name=msn10k, fold=1 will load the train/validation/test
        files from the 'Fold1' directory. This option applies only to
        datasets that are already k-folded.
download_if_missing : optional, True by default.
If False, raise an IOError if the data is not locally available
instead of trying to download the data from the source site.
force_download : optional, False by default.
If True, download data even if it is on disk.
with_models : optional, True by default.
When True, the method downloads the models generated with different
tools (QuickRank, LightGBM, XGBoost, etc.) to ease the comparison.
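    Returns
    -------
    container : DatasetContainer or None
        A container exposing the loaded train/test (and, when available,
        validation) datasets, the license agreement and, when with_models
        is True, the model filenames. None is returned when dataset_name
        is not found in the remote catalogue.

    Examples
    --------
    Illustrative only (requires network access and 'msn10k' to be listed
    in the remote catalogue):

    >>> container = load_dataset('msn10k', fold=1)  # doctest: +SKIP
    >>> container.train_dataset                     # doctest: +SKIP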
"""
dataset_catalogue = __dataset_catalogue__()
dataset_dictionary = dataset_catalogue.get(dataset_name)
if dataset_dictionary is None:
return None
data_home = __get_data_home__()
data = __fetch_dataset_and_models__(dataset_dictionary, fold, data_home,
download_if_missing,
force_download,
with_models)
dataset_name = dataset_dictionary['DATASET_NAME']
dataset_format = dataset_dictionary['DATASET_FORMAT']
container = DatasetContainer()
print "Loading files. This may take a few minutes."
    if data.get('train') is not None:
        train_dataset = Dataset.load(data['train'],
                                     name=dataset_name + "_train",
                                     format=dataset_format)
        container.train_dataset = train_dataset
    if data.get('test') is not None:
        test_dataset = Dataset.load(data['test'],
                                    name=dataset_name + "_test",
                                    format=dataset_format)
        container.test_dataset = test_dataset
    if data.get('validation') is not None:
        validation_dataset = Dataset.load(data['validation'],
                                          name=dataset_name + "_validation",
                                          format=dataset_format)
        container.validation_dataset = validation_dataset
container.license_agreement = data['license_agreement']
if with_models:
container.model_filenames = data['models']
print "done loading dataset!"
return container