Source code for lalegpl.datasets.auto_weka

# Copyright 2019 IBM Corporation
#
# Licensed under the GNU General Public License 3.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.gnu.org/licenses/gpl-3.0.txt
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
import os
import pandas
import tempfile
import urllib
import zipfile
import arff

def fetch(dataset, data_home=None, convert_strings_to_integers=True):
    """Download (if needed) and load an Auto-WEKA dataset as pandas objects.

    The archive ``<dataset>.zip`` is fetched from the Auto-WEKA dataset
    site and its ``train.arff`` / ``test.arff`` members are cached under
    ``<data_home>/auto_weka/<dataset>``.  The download is skipped when both
    files are already present.

    Parameters
    ----------
    dataset : str
        Name of the dataset; used both in the download URL and as the name
        of the cache sub-directory.
    data_home : str, optional
        Root of the cache directory.  Defaults to ``~/lale_data``.  A
        leading ``~`` is expanded.
    convert_strings_to_integers : bool, default True
        If true, every feature column is label-encoded to integers with
        ``sklearn.preprocessing.LabelEncoder`` fitted on the categories
        declared in the ARFF header.  The target column is never encoded.

    Returns
    -------
    tuple
        ``((train_X, train_y), (test_X, test_y))`` where each ``X`` is a
        ``pandas.DataFrame`` of all attributes but the last, and each ``y``
        is a ``pandas.Series`` holding the last attribute.

    Raises
    ------
    AssertionError
        If the ARFF files are still missing after the download step, or if
        an attribute is not categorical (only categorical attributes are
        currently supported).
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly before using it.
    import urllib.request

    if data_home is None:
        data_home = os.path.join('~', 'lale_data')
    data_home = os.path.expanduser(data_home)
    base_url = 'https://www.cs.ubc.ca/labs/beta/Projects/autoweka/datasets'
    zip_url = '{}/{}.zip'.format(base_url, dataset)
    data_dir = os.path.join(data_home, 'auto_weka', dataset)
    train_file = os.path.join(data_dir, 'train.arff')
    test_file = os.path.join(data_dir, 'test.arff')

    if not os.path.exists(train_file) or not os.path.exists(test_file):
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
            print('created directory {}'.format(data_dir))
        # Download into a temporary directory rather than an open
        # NamedTemporaryFile: reopening such a file by name while it is
        # still open is not portable (it fails on Windows).
        with tempfile.TemporaryDirectory() as tmp_dir:
            zip_path = os.path.join(tmp_dir, 'dataset.zip')
            urllib.request.urlretrieve(zip_url, zip_path)
            with zipfile.ZipFile(zip_path) as myzip:
                # Only extract what is missing so an existing file is
                # never overwritten.
                if not os.path.exists(train_file):
                    myzip.extract('train.arff', data_dir)
                if not os.path.exists(test_file):
                    myzip.extract('test.arff', data_dir)
    assert os.path.exists(train_file) and os.path.exists(test_file)

    def col_name(attributes, i):
        """Return the name of the i-th ARFF attribute."""
        name, typ = attributes[i]
        # TODO: this currently only works for categoricals
        # e.g., ['vhigh', 'high', 'med', 'low']
        assert isinstance(typ, list), \
            'attribute {!r} is not categorical'.format(name)
        return name

    def col_type(attributes, i):
        """Return the list of categories of the i-th ARFF attribute."""
        name, typ = attributes[i]
        # TODO: this currently only works for categoricals
        # e.g., ['vhigh', 'high', 'med', 'low']
        assert isinstance(typ, list), \
            'attribute {!r} is not categorical'.format(name)
        return typ

    def col_list(data, i):
        """Return the i-th field of every data row as a list."""
        return [row[i] for row in data]

    def col_list_strings_as_integers(data, i, col_type):
        """Return column i, label-encoded when the column is categorical."""
        from sklearn.preprocessing import LabelEncoder
        values = [row[i] for row in data]
        if isinstance(col_type, list):
            # categorical such as, ['vhigh', 'high', 'med', 'low']
            le = LabelEncoder()
            # Fit on the declared categories, not on the observed values,
            # so train and test share one encoding.
            le.fit(col_type)
            # Encode the whole column in one call instead of once per row.
            return list(le.transform(values))
        return values

    def make_X(data_dict):
        """Build the feature DataFrame from all attributes but the last."""
        attributes, data = data_dict['attributes'], data_dict['data']
        indices = range(len(attributes) - 1)
        if convert_strings_to_integers:
            dict_of_lists = {
                col_name(attributes, i): col_list_strings_as_integers(
                    data, i, col_type(attributes, i))
                for i in indices}
        else:
            dict_of_lists = {
                col_name(attributes, i): col_list(data, i)
                for i in indices}
        return pandas.DataFrame(dict_of_lists)

    def make_y(data_dict):
        """Build the target Series from the last attribute (not encoded)."""
        attributes, data = data_dict['attributes'], data_dict['data']
        i = len(attributes) - 1
        return pandas.Series(col_list(data, i), name=col_name(attributes, i))

    with open(train_file) as f:
        train_dict = arff.load(f)
    train_X, train_y = make_X(train_dict), make_y(train_dict)
    with open(test_file) as f:
        test_dict = arff.load(f)
    test_X, test_y = make_X(test_dict), make_y(test_dict)
    return (train_X, train_y), (test_X, test_y)
def fetch_car(data_home=None, convert_strings_to_integers=True):
    """Load the Auto-WEKA ``car`` dataset.

    Convenience wrapper that forwards both arguments unchanged to
    ``fetch`` with the dataset name fixed to ``'car'`` and returns
    whatever ``fetch`` returns.
    """
    return fetch(
        'car',
        data_home=data_home,
        convert_strings_to_integers=convert_strings_to_integers,
    )