Source code for lale.datasets.multitable.util

# Copyright 2019, 2020, 2021 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, List, Tuple

from numpy import random

try:
    from pyspark.sql import SparkSession

    spark_installed = True
except ImportError:
    spark_installed = False


from lale.datasets.data_schemas import add_table_name, get_table_name
from lale.helpers import _is_pandas_df, _is_spark_df, randomstate_type


[docs]def multitable_train_test_split(
    dataset: List[Any],
    main_table_name: str,
    label_column_name: str,
    test_size: float = 0.25,
    random_state: randomstate_type = None,
) -> Tuple:
    """
    Splits X and y into random train and test subsets stratified by
    labels and protected attributes.

    Behaves similar to the `train_test_split`_ function from scikit-learn.

    .. _`train_test_split`: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

    Parameters
    ----------
    dataset : list of either Pandas or Spark dataframes

      Each dataframe in the list corresponds to an entity/table in the multi-table setting.

    main_table_name : string

      The name of the main table as the split is going to be based on the main table.

    label_column_name : string

      The name of the label column from the main table.

    test_size : float or int, default=0.25

      If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.
      If int, represents the absolute number of test samples.

    random_state : int, RandomState instance or None, default=None

      Controls the shuffling applied to the data before applying the split.
      Pass an integer for reproducible output across multiple function calls.

      - None

          RandomState used by numpy.random

      - numpy.random.RandomState

          Use the provided random state, only affecting other users of that same random state instance.

      - integer

          Explicit seed.

    Returns
    -------
    result : tuple

      - item 0: train_X, List of datasets corresponding to the train split

      - item 1: test_X, List of datasets corresponding to the test split

      - item 2: train_y

      - item 3: test_y


    Raises
    ------
    jsonschema.ValueError
        Bad configuration.  Either the table name was not found, or te provided list does not contain
         spark or pandas dataframes
    """
    main_table_df = None
    index_of_main_table = -1
    for i, df in enumerate(dataset):
        if get_table_name(df) == main_table_name:
            main_table_df = df
            index_of_main_table = i
    if main_table_df is None:
        table_names = [get_table_name(df) for df in dataset]
        raise ValueError(
            f"Could not find {main_table_name} in the given dataset, the table names are {table_names}"
        )
    if _is_pandas_df(main_table_df):
        num_rows = len(main_table_df)
    elif _is_spark_df(main_table_df):
        # main_table_df = main_table_df.toPandas()
        num_rows = main_table_df.count()
    else:
        raise ValueError(
            "multitable_train_test_split can only work with a list of Pandas or Spark dataframes."
        )
    if 0 < test_size < 1:
        num_test_rows = int(num_rows * test_size)
    else:
        num_test_rows = int(test_size)
    test_indices = random.choice(range(num_rows), num_test_rows, replace=False)
    train_indices = list(set([*range(num_rows)]) - set(test_indices.tolist()))
    assert len(test_indices) + len(train_indices) == num_rows
    train_dataset = list(dataset)
    test_dataset = list(dataset)
    if _is_pandas_df(main_table_df):
        train_main_df = main_table_df.iloc[train_indices]
        test_main_df = main_table_df.iloc[test_indices]
        train_y = train_main_df[label_column_name]
        test_y = test_main_df[label_column_name]
    elif _is_spark_df(main_table_df):
        spark_session = SparkSession.builder.appName(  # type: ignore
            "multitable_train_test_split"
        ).getOrCreate()
        train_main_df = spark_session.createDataFrame(
            data=main_table_df.toPandas().iloc[train_indices]
        )
        test_main_df = spark_session.createDataFrame(
            data=main_table_df.toPandas().iloc[test_indices]
        )
        train_y = train_main_df.select(label_column_name)
        test_y = test_main_df.select(label_column_name)
    else:
        raise ValueError(
            "multitable_train_test_split can only work with a list of Pandas or Spark dataframes."
        )

    train_main_df = add_table_name(train_main_df, main_table_name)
    test_main_df = add_table_name(test_main_df, main_table_name)
    train_dataset[index_of_main_table] = train_main_df
    test_dataset[index_of_main_table] = test_main_df
    return train_dataset, test_dataset, train_y, test_y