# Source code for lale.datasets.uci.uci_datasets

# Copyright 2019-2023 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import tempfile
import urllib.request
import zipfile

import numpy as np
import pandas as pd

import lale.datasets.data_schemas

download_data_dir = os.path.join(os.path.dirname(__file__), "download_data")
download_data_url = "http://archive.ics.uci.edu/static/public"


def download(dataset_id, zip_name, contents_files):
    """Fetch a zip archive from the UCI repository and extract member files.

    Files that are already present under ``download_data_dir/dataset_id``
    are not downloaded again; the archive is only retrieved when at least
    one requested member is missing.

    Parameters
    ----------
    dataset_id : str
        Identifier of the dataset on the UCI server (path segment of the URL).
    zip_name : str
        File name of the zip archive on the UCI server.
    contents_files : list of str
        Base names of the archive members to extract.

    Returns
    -------
    list of str
        Local paths of the extracted files, in the same order as
        ``contents_files``.
    """
    zip_url = f"{download_data_url}/{dataset_id}/{zip_name}"
    data_dir = os.path.join(download_data_dir, dataset_id)
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    full_file_names = [os.path.join(data_dir, base) for base in contents_files]

    def all_downloaded():
        # True when every requested member already exists locally.
        return all(os.path.exists(full) for full in full_file_names)

    if not all_downloaded():
        with tempfile.NamedTemporaryFile(suffix=".zip") as tmp_zip_file:
            # this request is to a string that begins with a hardcoded http
            # url, so does not risk leaking local data
            urllib.request.urlretrieve(zip_url, tmp_zip_file.name)  # nosec
            with zipfile.ZipFile(tmp_zip_file.name) as myzip:
                # extract only the members that are still missing
                for full, base in zip(full_file_names, contents_files):
                    if not os.path.exists(full):
                        myzip.extract(base, data_dir)
    assert all_downloaded()
    return full_file_names
def tsv_to_Xy(file_name, target_col, schema_orig):
    """Read a tab-separated file and split it into features and target.

    Parameters
    ----------
    file_name : str
        Path of the tab-separated data file.
    target_col : str
        Name of the target column; all other columns become features.
    schema_orig : dict
        JSON schema describing the whole table; ``items.items`` holds one
        schema per column with the column name in ``description``.

    Returns
    -------
    (X, y) : tuple
        Features DataFrame and target Series, each annotated with a
        derived JSON schema via ``lale.datasets.data_schemas.add_schema``.
    """
    frame = pd.read_csv(file_name, sep="\t")
    col_schemas = schema_orig["items"]["items"]
    # feature columns: every column whose description is not the target
    feature_schemas = [cs for cs in col_schemas if cs["description"] != target_col]
    feature_names = [cs["description"] for cs in feature_schemas]
    X = frame.loc[:, feature_names]
    n_rows, n_feature_cols = X.shape
    schema_X = {
        **schema_orig,
        "minItems": n_rows,
        "maxItems": n_rows,
        "items": {
            "type": "array",
            "minItems": n_feature_cols,
            "maxItems": n_feature_cols,
            "items": feature_schemas,
        },
    }
    X = lale.datasets.data_schemas.add_schema(X, schema_X)
    # the target is a single column, so its row schema is scalar
    target_schemas = [cs for cs in col_schemas if cs["description"] == target_col]
    y = frame[target_col]
    schema_y = {
        **schema_orig,
        "minItems": n_rows,
        "maxItems": n_rows,
        "items": target_schemas[0],
    }
    y = lale.datasets.data_schemas.add_schema(y, schema_y)
    return X, y
def fetch_drugscom():
    """Fetch the Drugs.com review dataset (UCI id 462) as (train, test) splits.

    Downloads the archive on first use, then loads the pre-split train and
    test TSV files and separates the ``rating`` target column from the
    features.

    Returns
    -------
    (train_X, train_y, test_X, test_y) : tuple
        Feature DataFrames and target Series for the train and test splits,
        each annotated with a JSON schema.
    """
    files = download(
        "462",
        "drug+review+dataset+drugs+com.zip",
        ["drugsComTest_raw.tsv", "drugsComTrain_raw.tsv"],
    )
    target_col = "rating"
    json_schema = {
        "$schema": "http://json-schema.org/draft-04/schema#",
        "type": "array",
        "items": {
            "type": "array",
            "minItems": 6,
            "maxItems": 6,
            "items": [
                {"description": "drugName", "type": "string"},
                {
                    "description": "condition",
                    # np.nan instead of np.NaN: the NaN alias was removed
                    # in NumPy 2.0, while np.nan works on all versions
                    "anyOf": [{"type": "string"}, {"enum": [np.nan]}],
                },
                {"description": "review", "type": "string"},
                {
                    "description": "rating",
                    "enum": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
                },
                {"description": "date", "type": "string"},
                {"description": "usefulCount", "type": "integer", "minimum": 0},
            ],
        },
    }
    # the archive stores the test file first, the train file second
    test_X, test_y = tsv_to_Xy(files[0], target_col, json_schema)
    train_X, train_y = tsv_to_Xy(files[1], target_col, json_schema)
    return train_X, train_y, test_X, test_y
def fetch_household_power_consumption():
    """Fetch the individual household electric power consumption dataset
    (UCI id 235).

    Downloads and extracts the archive on first use, then parses the
    semicolon-separated data file.

    Returns
    -------
    pandas.DataFrame
        The full dataset as read from ``household_power_consumption.txt``.
    """
    paths = download(
        "235",
        "individual+household+electric+power+consumption.zip",
        ["household_power_consumption.txt"],
    )
    return pd.read_csv(paths[0], sep=";")