Source code for lale.datasets.sklearn_to_pandas

# Copyright 2019 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pandas as pd
import sklearn.datasets
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import lale.datasets.data_schemas
from lale.datasets.util import load_boston


def _bunch_to_df(bunch, schema_X, schema_y, test_size=0.2, random_state=42):
    """Split a dataset bunch into schema-annotated pandas train/test data.

    Parameters
    ----------
    bunch :
        Object exposing ``data`` and ``target`` attributes, which are
        handed to ``train_test_split``.
    schema_X : dict
        JSON schema for the feature matrix. ``schema_X["items"]["items"]``
        is either a list of per-column schemas, whose ``"description"``
        entries become the column names, or a single schema shared by all
        columns, in which case the columns are named ``x0``, ``x1``, ... up
        to ``schema_X["items"]["maxItems"]``.
    schema_y : dict
        JSON schema for the target vector.
    test_size :
        Forwarded to ``train_test_split``.
    random_state :
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``((train_X, train_y), (test_X, test_y))``. The features are
        DataFrames and the targets are Series named ``"target"``; each is
        passed through ``lale.datasets.data_schemas.add_schema`` together
        with a copy of its schema whose ``minItems`` and ``maxItems`` are
        set to the row count of that split.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        bunch.data, bunch.target, test_size=test_size, random_state=random_state
    )

    # Derive column names: per-column schemas carry them in "description",
    # a single shared schema only tells us how many columns there are.
    column_schemas = schema_X["items"]["items"]
    if not isinstance(column_schemas, list):
        width = schema_X["items"]["maxItems"]
        columns = [f"x{idx}" for idx in range(width)]
    else:
        columns = [col["description"] for col in column_schemas]

    train_frame = pd.DataFrame(X_tr, columns=columns)
    test_frame = pd.DataFrame(X_te, columns=columns)
    train_target = pd.Series(y_tr, name="target")
    test_target = pd.Series(y_te, name="target")
    n_train = train_frame.shape[0]
    n_test = test_frame.shape[0]

    def _sized(schema, nrows):
        # Copy of the schema pinned to exactly `nrows` rows.
        return {**schema, "minItems": nrows, "maxItems": nrows}

    add_schema = lale.datasets.data_schemas.add_schema
    train_X = add_schema(train_frame, _sized(schema_X, n_train))
    test_X = add_schema(test_frame, _sized(schema_X, n_test))
    train_y = add_schema(train_target, _sized(schema_y, n_train))
    test_y = add_schema(test_target, _sized(schema_y, n_test))
    return (train_X, train_y), (test_X, test_y)


def load_iris_df(test_size=0.2, random_state=42):
    """Load the iris dataset as pandas train/test splits.

    Unlike the other loaders in this module, the returned objects are built
    directly with pandas and are not passed through ``add_schema``.

    Parameters
    ----------
    test_size :
        Forwarded to ``train_test_split``.
    random_state :
        Seed used for both the initial ``shuffle`` and the subsequent
        ``train_test_split``. The default of 42 reproduces the split this
        function produced when the seed was hard-coded.

    Returns
    -------
    tuple
        ``((X_train_df, y_train_df), (X_test_df, y_test_df))`` where the
        features are DataFrames whose columns are ``iris.feature_names``
        and the targets are Series named ``"target"``.
    """
    iris = sklearn.datasets.load_iris()
    target_name = "target"
    # The explicit shuffle is kept (even though train_test_split shuffles
    # too) so that the default split stays identical to earlier versions.
    X, y = shuffle(iris.data, iris.target, random_state=random_state)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    X_train_df = pd.DataFrame(X_train, columns=iris.feature_names)
    y_train_df = pd.Series(y_train, name=target_name)
    X_test_df = pd.DataFrame(X_test, columns=iris.feature_names)
    y_test_df = pd.Series(y_test, name=target_name)
    return (X_train_df, y_train_df), (X_test_df, y_test_df)
def digits_df(test_size=0.2, random_state=42):
    """Load the scikit-learn digits dataset as schema-annotated pandas data.

    Parameters
    ----------
    test_size :
        Forwarded to ``_bunch_to_df``.
    random_state :
        Forwarded to ``_bunch_to_df``.

    Returns
    -------
    tuple
        ``((train_X, train_y), (test_X, test_y))`` as produced by
        ``_bunch_to_df``. The feature schema uses one shared column schema
        (numbers from 0 to 16) sized to the number of columns in the
        loaded data; the target schema allows integers from 0 to 9.
    """
    bunch = sklearn.datasets.load_digits()
    n_features = bunch.data.shape[1]

    # All columns share one schema, so the row schema carries the width.
    row_schema = {
        "type": "array",
        "minItems": n_features,
        "maxItems": n_features,
        "items": {"type": "number", "minimum": 0, "maximum": 16},
    }
    schema_X = {
        "description": "Features of digits dataset (classification).",
        "documentation_url": "https://scikit-learn.org/0.20/datasets/index.html#optical-recognition-of-handwritten-digits-dataset",
        "type": "array",
        "items": row_schema,
    }

    label_schema = {"type": "integer", "minimum": 0, "maximum": 9}
    schema_y = {
        "$schema": "http://json-schema.org/draft-04/schema#",
        "type": "array",
        "items": label_schema,
    }

    return _bunch_to_df(bunch, schema_X, schema_y, test_size, random_state)
def covtype_df(test_size=0.2, random_state=42):
    """Fetch the forest covertypes dataset as schema-annotated pandas data.

    Parameters
    ----------
    test_size :
        Forwarded to ``_bunch_to_df``.
    random_state :
        Forwarded to ``_bunch_to_df``.

    Returns
    -------
    tuple
        ``((train_X, train_y), (test_X, test_y))`` as produced by
        ``_bunch_to_df``. The feature schema names 54 columns: 10 integer
        measurements followed by 4 ``Wilderness_Area`` and 40 ``Soil_Type``
        0/1 indicator columns.
    """
    covtype = sklearn.datasets.fetch_covtype()

    integer_columns = [
        "Elevation",
        "Aspect",
        "Slope",
        "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am",
        "Hillshade_Noon",
        "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points",
    ]
    # Indicator columns are numbered from 1: 4 wilderness areas, 40 soil types.
    indicator_columns = [f"Wilderness_Area{i}" for i in range(1, 5)] + [
        f"Soil_Type{i}" for i in range(1, 41)
    ]
    feature_schemas = [
        {"description": name, "type": "integer"} for name in integer_columns
    ] + [{"description": name, "enum": [0, 1]} for name in indicator_columns]

    schema_X = {
        "description": "Features of forest covertypes dataset (classification).",
        "documentation_url": "https://scikit-learn.org/0.20/datasets/index.html#forest-covertypes",
        "type": "array",
        "items": {
            "type": "array",
            "minItems": 54,
            "maxItems": 54,
            "items": feature_schemas,
        },
    }
    schema_y = {
        "description": "Target of forest covertypes dataset (classification).",
        "documentation_url": "https://scikit-learn.org/0.20/datasets/index.html#forest-covertypes",
        "type": "array",
        "items": {
            "description": "The cover type, i.e., the dominant species of trees.",
            # fetch_covtype labels the seven cover types 1..7, not 0..6.
            "enum": [1, 2, 3, 4, 5, 6, 7],
        },
    }
    (train_X, train_y), (test_X, test_y) = _bunch_to_df(
        covtype, schema_X, schema_y, test_size, random_state
    )
    return (train_X, train_y), (test_X, test_y)
def california_housing_df(test_size=0.2, random_state=42):
    """Fetch the California housing dataset as schema-annotated pandas data.

    Parameters
    ----------
    test_size :
        Forwarded to ``_bunch_to_df``.
    random_state :
        Forwarded to ``_bunch_to_df``.

    Returns
    -------
    tuple
        ``((train_X, train_y), (test_X, test_y))`` as produced by
        ``_bunch_to_df``. The feature schema names 8 numeric columns; all
        but ``Longitude`` are declared with a minimum of 0.0. The target
        schema is a number with a minimum of 0.0.
    """
    housing = sklearn.datasets.fetch_california_housing()
    docs_url = "https://scikit-learn.org/0.20/datasets/index.html#california-housing-dataset"

    non_negative = (
        "MedInc",
        "HouseAge",
        "AveRooms",
        "AveBedrms",
        "Population",
        "AveOccup",
        "Latitude",
    )
    column_schemas = [
        {"description": name, "type": "number", "minimum": 0.0}
        for name in non_negative
    ]
    # Longitude is the one column declared without a lower bound.
    column_schemas.append({"description": "Longitude", "type": "number"})

    schema_X = {
        "description": "Features of California housing dataset (regression).",
        "documentation_url": docs_url,
        "type": "array",
        "items": {
            "type": "array",
            "minItems": 8,
            "maxItems": 8,
            "items": column_schemas,
        },
    }
    schema_y = {
        "description": "Target of California housing dataset (regression).",
        "documentation_url": docs_url,
        "type": "array",
        "items": {
            "description": "Median house value for California districts.",
            "type": "number",
            "minimum": 0.0,
        },
    }

    return _bunch_to_df(housing, schema_X, schema_y, test_size, random_state)
def boston_housing_df(test_size=0.2, random_state=42):
    """Load the Boston house prices dataset as schema-annotated pandas data.

    Parameters
    ----------
    test_size :
        Forwarded to ``_bunch_to_df``.
    random_state :
        Forwarded to ``_bunch_to_df``.

    Returns
    -------
    tuple
        ``((train_X, train_y), (test_X, test_y))`` as produced by
        ``_bunch_to_df``. The feature schema names 13 columns: ``CHAS`` is
        a 0/1 indicator and the remaining columns are numbers with a lower
        bound. The target schema is a number with a minimum of 0.0.
    """
    housing = load_boston()
    schema_X = {
        "description": "Features of Boston house prices dataset (regression).",
        "documentation_url": "https://scikit-learn.org/0.20/datasets/index.html#boston-house-prices-dataset",
        "type": "array",
        "items": {
            "type": "array",
            "minItems": 13,
            "maxItems": 13,
            "items": [
                {"description": "CRIM", "type": "number", "minimum": 0.0},
                {"description": "ZN", "type": "number", "minimum": 0.0},
                {"description": "INDUS", "type": "number", "minimum": 0.0},
                {"description": "CHAS", "enum": [0, 1]},
                {"description": "NOX", "type": "number", "minimum": 0.0},
                {"description": "RM", "type": "number", "minimum": 1.0},
                {"description": "AGE", "type": "number", "minimum": 0.0},
                {"description": "DIS", "type": "number", "minimum": 0.0},
                {"description": "RAD", "type": "number", "minimum": 1},
                {"description": "TAX", "type": "number", "minimum": 0.0},
                # Pupil-teacher ratio; the dataset's name for this column
                # is PTRATIO (previously misspelled here as "PRATIO").
                {"description": "PTRATIO", "type": "number", "minimum": 0.0},
                {"description": "B", "type": "number", "minimum": 0.0},
                {"description": "LSTAT", "type": "number", "minimum": 0.0},
            ],
        },
    }
    schema_y = {
        "description": "Target of Boston house prices dataset (regression).",
        "documentation_url": "https://scikit-learn.org/0.20/datasets/index.html#boston-house-prices-dataset",
        "type": "array",
        "items": {
            "description": "Median value of owner-occupied homes in $1000's (MEDV)",
            "type": "number",
            "minimum": 0.0,
        },
    }
    (train_X, train_y), (test_X, test_y) = _bunch_to_df(
        housing, schema_X, schema_y, test_size, random_state
    )
    return (train_X, train_y), (test_X, test_y)