# Source code for lalegpl.datasets.auto_weka

# Copyright 2019 IBM Corporation
#
# Licensed under the GNU General Public License 3.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.gnu.org/licenses/gpl-3.0.txt
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
import os
import pandas
import tempfile
import urllib
import zipfile
import arff

def fetch(dataset, data_home=None, convert_strings_to_integers=True):
    """Download (if needed) and load an Auto-WEKA dataset as pandas objects.

    The archive ``<dataset>.zip`` is fetched from the Auto-WEKA dataset site
    and its ``train.arff`` / ``test.arff`` members are cached under
    ``<data_home>/auto_weka/<dataset>``. Members already present on disk are
    not extracted again, and nothing is downloaded when both are present.

    Parameters
    ----------
    dataset : str
        Dataset name, used both as the zip file stem in the download URL and
        as the cache sub-directory name.
    data_home : str, optional
        Root cache directory. Defaults to ``~/lale_data``; a leading ``~`` is
        expanded to the user's home directory.
    convert_strings_to_integers : bool, default True
        If true, every categorical feature column is replaced by the integer
        codes of a scikit-learn ``LabelEncoder`` fitted on the categories
        declared in the ARFF header. The target column is never encoded.

    Returns
    -------
    tuple
        ``((train_X, train_y), (test_X, test_y))`` where each ``X`` is a
        ``pandas.DataFrame`` built from all attributes but the last and each
        ``y`` is a ``pandas.Series`` built from the last attribute.

    Raises
    ------
    AssertionError
        If an attribute is not categorical (only nominal attributes are
        supported for now), or if the ARFF files are missing after the
        download step.
    """
    # `import urllib` alone does not load the `request` submodule; import it
    # explicitly instead of relying on another module having done so.
    import urllib.request

    if data_home is None:
        data_home = os.path.join('~', 'lale_data')
    data_home = os.path.expanduser(data_home)
    base_url = 'https://www.cs.ubc.ca/labs/beta/Projects/autoweka/datasets'
    zip_url = '{}/{}.zip'.format(base_url, dataset)
    data_dir = os.path.join(data_home, 'auto_weka', dataset)
    train_file = os.path.join(data_dir, 'train.arff')
    test_file = os.path.join(data_dir, 'test.arff')

    if not (os.path.exists(train_file) and os.path.exists(test_file)):
        if not os.path.exists(data_dir):
            os.makedirs(data_dir, exist_ok=True)
            print('created directory {}'.format(data_dir))
        # Download into a temporary directory rather than into an open
        # NamedTemporaryFile: reopening such a file by name fails on Windows.
        with tempfile.TemporaryDirectory() as tmp_dir:
            zip_path = os.path.join(tmp_dir, 'download.zip')
            urllib.request.urlretrieve(zip_url, zip_path)
            with zipfile.ZipFile(zip_path) as archive:
                members = (('train.arff', train_file),
                           ('test.arff', test_file))
                for member, target in members:
                    if not os.path.exists(target):
                        archive.extract(member, data_dir)
    assert os.path.exists(train_file) and os.path.exists(test_file)

    def attribute(attributes, i):
        """Return ``(name, categories)`` of the i-th ARFF attribute."""
        name, typ = attributes[i]
        #TODO: this currently only works for categoricals
        assert isinstance(typ, list)  # e.g., ['vhigh', 'high', 'med', 'low']
        return name, typ

    def column_values(data, i):
        """Return the i-th field of every row as a list."""
        return [row[i] for row in data]

    def encoded_column(data, i, categories):
        """Return the i-th column with categorical values label-encoded."""
        from sklearn.preprocessing import LabelEncoder
        values = column_values(data, i)
        if not isinstance(categories, list) or not values:
            return values
        encoder = LabelEncoder()
        encoder.fit(categories)
        # One transform call for the whole column instead of one per row.
        return list(encoder.transform(values))

    def make_X(data_dict):
        """Build the feature frame from all attributes except the last."""
        attributes, data = data_dict['attributes'], data_dict['data']
        columns = {}
        for i in range(len(attributes) - 1):
            name, categories = attribute(attributes, i)
            if convert_strings_to_integers:
                columns[name] = encoded_column(data, i, categories)
            else:
                columns[name] = column_values(data, i)
        return pandas.DataFrame(columns)

    def make_y(data_dict):
        """Build the target series from the last attribute (not encoded)."""
        attributes, data = data_dict['attributes'], data_dict['data']
        last = len(attributes) - 1
        name, _ = attribute(attributes, last)
        return pandas.Series(column_values(data, last), name=name)

    def load(path):
        """Parse one ARFF file and split it into ``(X, y)``."""
        with open(path) as f:
            data_dict = arff.load(f)
        return make_X(data_dict), make_y(data_dict)

    return load(train_file), load(test_file)

def fetch_car(data_home=None, convert_strings_to_integers=True):
    """Load the Auto-WEKA ``car`` dataset.

    Convenience wrapper that forwards both arguments unchanged to ``fetch``
    with the dataset name ``'car'`` and returns its result.
    """
    return fetch(
        'car',
        data_home=data_home,
        convert_strings_to_integers=convert_strings_to_integers,
    )