Source code for lale.lib.aif360.datasets

# Copyright 2021-2023 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Fetcher methods to load fairness datasets and provide fairness_info for them.

See the notebook `demo_fairness_datasets`_ for an example of using
these functions, along with some tables and figures about the datasets.
There is also an `arXiv paper`_ about these datasets.
Some of the fetcher methods have a `preprocess` argument that
defaults to False.
The notebook does not use that argument, instead demonstrating how
to do any required preprocessing in the context of a Lale pipeline.
Most of the datasets are from `OpenML`_, a few are from `meps.ahrq`_ or
`ProPublica`_, and most of them have been used in various papers.
The Lale library does not distribute the datasets themselves; it only
provides methods for downloading them.

.. _`demo_fairness_datasets`: https://github.com/IBM/lale/blob/master/examples/demo_fairness_datasets.ipynb
.. _`arXiv paper`: https://arxiv.org/abs/2308.00133
.. _`OpenML`: https://www.openml.org/
.. _`meps.ahrq`: https://meps.ahrq.gov/data_stats/data_use.jsp
.. _`ProPublica`: https://github.com/propublica/compas-analysis
"""

import logging
import os
import typing
import urllib.request
from enum import Enum

import aif360
import aif360.datasets
import numpy as np
import pandas as pd
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import (
    load_preproc_data_compas,
)

import lale.datasets
import lale.datasets.openml
import lale.lib.aif360.util

logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)
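

# A minimal usage sketch (not part of the original fetchers): every fetcher in
# this module returns a triple (X, y, fairness_info), and fairness_info can be
# splatted into the fairness metric factories of lale.lib.aif360. Treat the
# exact metric API used here (disparate_impact and its score_data method) as
# an assumption to verify against the installed lale version.
def _example_measure_disparate_impact():
    import lale.lib.aif360

    X, y, fairness_info = fetch_adult_df(preprocess=True)
    # disparate_impact returns a scorer; score_data evaluates the metric
    # directly on labeled data instead of on a trained estimator
    di_scorer = lale.lib.aif360.disparate_impact(**fairness_info)
    return di_scorer.score_data(X=X, y_pred=y)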


def fetch_adult_df(preprocess: bool = False):
    """
    Fetch the `adult`_ dataset from OpenML and add `fairness_info`.

    It contains information about individuals from the 1994 U.S. census.
    The prediction task is a binary classification on whether the income
    of a person exceeds 50K a year. Without preprocessing, the dataset has
    48,842 rows and 14 columns. There are two protected attributes, sex
    and race, and the disparate impact is 0.23. The data includes both
    categorical and numeric columns, and has some missing values.

    .. _`adult`: https://www.openml.org/d/179

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      impute missing values;
      encode protected attributes in X as 0 or 1 to indicate privileged groups;
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "adult", "classification", astype="pandas", preprocess=preprocess
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    if preprocess:
        sex = pd.Series(orig_X["sex_Male"] == 1, dtype=np.float64)
        race = pd.Series(orig_X["race_White"] == 1, dtype=np.float64)
        dropped_X = orig_X.drop(
            labels=[
                "race_Amer-Indian-Eskimo",
                "race_Asian-Pac-Islander",
                "race_Black",
                "race_Other",
                "race_White",
                "sex_Female",
                "sex_Male",
            ],
            axis=1,
        )
        encoded_X = dropped_X.assign(sex=sex, race=race)
        assert not encoded_X.isna().any().any()
        assert not orig_y.isna().any().any()
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [
                {"feature": "sex", "reference_group": [1]},
                {"feature": "race", "reference_group": [1]},
            ],
        }
        return encoded_X, orig_y, fairness_info
    else:
        fairness_info = {
            "favorable_labels": [">50K"],
            "protected_attributes": [
                {"feature": "race", "reference_group": ["White"]},
                {"feature": "sex", "reference_group": ["Male"]},
            ],
        }
        return orig_X, orig_y, fairness_info
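

# A sketch of splitting a fetched dataset while stratifying on both the labels
# and the protected attributes. It assumes fair_stratified_train_test_split is
# exported by lale.lib.aif360 with this keyword-splat signature; verify against
# the installed lale version.
def _example_split_adult():
    import lale.lib.aif360

    X, y, fairness_info = fetch_adult_df(preprocess=True)
    train_X, test_X, train_y, test_y = lale.lib.aif360.fair_stratified_train_test_split(
        X, y, **fairness_info, test_size=0.33
    )
    return train_X, test_X, train_y, test_y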


def fetch_bank_df(preprocess: bool = False):
    """
    Fetch the `bank-marketing`_ dataset from OpenML and add `fairness_info`.

    It contains information from marketing campaigns of a Portuguese bank.
    The prediction task is a binary classification on whether the client
    will subscribe to a term deposit. Without preprocessing, the dataset
    has 45,211 rows and 16 columns. There is one protected attribute, age,
    and the disparate impact is 0.84. The data includes both categorical
    and numeric columns, with no missing values.

    .. _`bank-marketing`: https://www.openml.org/d/1461

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged groups;
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "bank-marketing", "classification", astype="pandas", preprocess=preprocess
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index().astype(np.float64)
    column_map = {
        "v1": "age",
        "v2": "job",
        "v3": "marital",
        "v4": "education",
        "v5": "default",
        "v6": "balance",
        "v7": "housing",
        "v8": "loan",
        "v9": "contact",
        "v10": "day",
        "v11": "month",
        "v12": "duration",
        "v13": "campaign",
        "v14": "pdays",
        "v15": "previous",
        "v16": "poutcome",
    }
    if preprocess:

        def map_col(col):
            if col.find("_") == -1:
                return column_map[col]
            prefix, suffix = col.split("_")
            return column_map[prefix] + "_" + suffix

        orig_X.columns = [map_col(col) for col in orig_X.columns]
        age = pd.Series(orig_X["age"] >= 25, dtype=np.float64)
        encoded_X = orig_X.assign(age=age)
        encoded_y = pd.Series(orig_y == 0, dtype=np.float64, name=orig_y.name)
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [
                {"feature": "age", "reference_group": [1]},
            ],
        }
        return encoded_X, encoded_y, fairness_info
    else:
        orig_X.columns = [column_map[col] for col in orig_X.columns]
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [
                {"feature": "age", "reference_group": [[25, 1000]]},
            ],
        }
        return orig_X, orig_y, fairness_info


def fetch_default_credit_df():
    """
    Fetch the `Default of Credit Card Clients Dataset`_ from OpenML and add `fairness_info`.

    It is a binary classification to predict whether the customer suffers
    a default in the next month (1) or not (0). The dataset has 30,000
    rows and 24 columns, all numeric. The protected attribute is sex and
    the disparate impact is 0.957.

    .. _`Default of Credit Card Clients Dataset`: https://www.openml.org/d/43435

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "Default-of-Credit-Card-Clients-Dataset",
        "classification",
        astype="pandas",
        preprocess=False,
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    fairness_info = {
        "favorable_labels": [0],
        "protected_attributes": [
            {"feature": "sex", "reference_group": [2]},  # female
        ],
    }
    return orig_X, orig_y, fairness_info


def fetch_heart_disease_df():
    """
    Fetch the `heart-disease`_ dataset from OpenML and add `fairness_info`.

    It is a binary classification to predict heart disease from the
    Cleveland database, with 303 rows and 13 columns, all numeric.
    The protected attribute is age and the disparate impact is 0.589.

    .. _`heart-disease`: https://www.openml.org/d/43398

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "heart-disease", "classification", astype="pandas", preprocess=False
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    fairness_info = {
        "favorable_labels": [1],
        "protected_attributes": [
            {"feature": "age", "reference_group": [[0, 54]]},
        ],
    }
    return orig_X, orig_y, fairness_info


def fetch_law_school_df():
    """
    Fetch the `law school`_ dataset from OpenML and add `fairness_info`.

    This function returns both X and y unchanged, since the dataset was
    already binarized by the OpenML contributors, with the target of
    predicting whether the GPA is greater than 3. The protected attribute
    is race1 and the disparate impact is 0.704. The dataset has 20,800
    rows and 11 columns (5 categorical and 6 numeric columns).

    .. _`law school`: https://www.openml.org/d/43890

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "law-school-admission-bianry",  # "bianry" [sic] matches the dataset name on OpenML
        "classification",
        astype="pandas",
        preprocess=False,
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    fairness_info = {
        "favorable_labels": ["TRUE"],
        "protected_attributes": [
            {"feature": "race1", "reference_group": ["white"]},
        ],
    }
    return orig_X, orig_y, fairness_info


def fetch_nlsy_df():
    """
    Fetch the `National Longitudinal Survey for the Youth (NLSY)`_ (also known
    as "University of Michigan Health and Retirement Study (HRS)") dataset
    from OpenML and add `fairness_info`.

    It is a binary classification to predict whether the income at a
    certain time exceeds a threshold, with 4,908 rows and 15 columns
    (comprising 6 categorical and 9 numerical columns). The protected
    attributes are age and gender and the disparate impact is 0.668.

    .. _`National Longitudinal Survey for the Youth (NLSY)`: https://www.openml.org/d/43892

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "national-longitudinal-survey-binary",
        "classification",
        astype="pandas",
        preprocess=False,
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    # drop income96, which is closely tied to the income-derived target
    dropped_X = orig_X.drop(labels=["income96"], axis=1)
    fairness_info = {
        "favorable_labels": ["1"],
        "protected_attributes": [
            {"feature": "age", "reference_group": [[18, 120]]},
            {"feature": "gender", "reference_group": ["Male"]},
        ],
    }
    return dropped_X, orig_y, fairness_info


def fetch_student_math_df():
    """
    Fetch the `Student Performance (Math)`_ dataset from OpenML and add `fairness_info`.

    The original prediction target is an integer math grade from 1 to 20.
    This function returns X unchanged but with a binarized version of the
    target y, using 1 for values >=10 and 0 otherwise. The two protected
    attributes are sex and age and the disparate impact is 0.894. The
    dataset has 395 rows and 32 columns, including both categorical and
    numeric columns.

    .. _`Student Performance (Math)`: https://www.openml.org/d/42352

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "UCI-student-performance-mat", "regression", astype="pandas", preprocess=False
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    # binarize the grade at >= 10, matching the docstring and the y name
    encoded_y = pd.Series(orig_y >= 10, dtype=np.float64, name="g3_ge_10")
    fairness_info = {
        "favorable_labels": [1],
        "protected_attributes": [
            {"feature": "sex", "reference_group": ["F"]},
            {"feature": "age", "reference_group": [[0, 17]]},
        ],
    }
    return orig_X, encoded_y, fairness_info


def fetch_student_por_df():
    """
    Fetch the `Student Performance (Portuguese)`_ dataset from OpenML and add `fairness_info`.

    The original prediction target is an integer Portuguese grade from 1
    to 20. This function returns X unchanged but with a binarized version
    of the target y, using 1 for values >=10 and 0 otherwise. The two
    protected attributes are sex and age and the disparate impact is
    0.858. The dataset has 649 rows and 32 columns, including both
    categorical and numeric columns.

    .. _`Student Performance (Portuguese)`: https://www.openml.org/d/42351

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "UCI-student-performance-por", "regression", astype="pandas", preprocess=False
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    encoded_y = pd.Series(orig_y >= 10, dtype=np.float64, name="g3_ge_10")
    fairness_info = {
        "favorable_labels": [1],
        "protected_attributes": [
            {"feature": "sex", "reference_group": ["F"]},
            {"feature": "age", "reference_group": [[0, 17]]},
        ],
    }
    return orig_X, encoded_y, fairness_info


def fetch_tae_df(preprocess: bool = False):
    """
    Fetch the `tae`_ dataset from OpenML and add `fairness_info`.

    It contains information from teaching assistant (TA) evaluations at
    the University of Wisconsin--Madison. The prediction task is a
    classification on the type of rating a TA receives (1=Low, 2=Medium,
    3=High). Without preprocessing, the dataset has 151 rows and 5
    columns. There is one protected attribute,
    "whether_of_not_the_ta_is_a_native_english_speaker" [sic], and the
    disparate impact is 0.45. The data includes both categorical and
    numeric columns, with no missing values.

    .. _`tae`: https://www.openml.org/d/48

    Parameters
    ----------
    preprocess : boolean or "y", optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged
      group ("native_english_speaker");
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.
      If "y", leave features X unchanged and only encode labels y as 0 or 1.
      If False, encode neither features X nor labels y.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "tae", "classification", astype="pandas", preprocess=(preprocess is True)
    )
    orig_X = pd.concat([train_X, test_X]).sort_index().astype(np.float64)
    orig_y = pd.concat([train_y, test_y]).sort_index().astype(np.float64)
    if preprocess is True:
        native_english_speaker = pd.Series(
            orig_X["whether_of_not_the_ta_is_a_native_english_speaker_1"] == 1,
            dtype=np.float64,
        )
        dropped_X = orig_X.drop(
            labels=[
                "whether_of_not_the_ta_is_a_native_english_speaker_1",
                "whether_of_not_the_ta_is_a_native_english_speaker_2",
            ],
            axis=1,
        )
        encoded_X = dropped_X.assign(native_english_speaker=native_english_speaker)
        # with preprocess=True, fetch label-encodes ratings 1/2/3 as 0/1/2,
        # so encoded value 2 corresponds to the original rating 3 (High)
        encoded_y = pd.Series(orig_y == 2, dtype=np.float64)
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [
                {"feature": "native_english_speaker", "reference_group": [1]},
            ],
        }
        return encoded_X, encoded_y, fairness_info
    elif preprocess == "y":
        # labels are unencoded here, so the favorable outcome is the
        # original rating 3 (High), consistent with the False case below
        encoded_y = pd.Series(orig_y == 3, dtype=np.float64)
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [
                {
                    "feature": "whether_of_not_the_ta_is_a_native_english_speaker",
                    "reference_group": [1],
                },
            ],
        }
        return orig_X, encoded_y, fairness_info
    else:
        fairness_info = {
            "favorable_labels": [3],
            "protected_attributes": [
                {
                    "feature": "whether_of_not_the_ta_is_a_native_english_speaker",
                    "reference_group": [1],
                },
            ],
        }
        return orig_X, orig_y, fairness_info
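

# A small sketch of tae's three-valued preprocess flag: passing "y" binarizes
# only the labels, leaving the raw features for downstream encoders inside a
# Lale pipeline.
def _example_tae_label_only():
    X, y, fairness_info = fetch_tae_df(preprocess="y")
    # X keeps its original columns; y is already 0/1 with 1 favorable
    return X.columns.tolist(), sorted(y.unique()), fairness_info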


def fetch_us_crime_df():
    """
    Fetch the `us_crime`_ (also known as "communities and crime") dataset from OpenML and add `fairness_info`.

    The original dataset has several columns with a large number of
    missing values, which this function drops. The binary protected
    attribute is blackgt6pct, which is derived by thresholding
    racepctblack > 0.06 and dropping the original racepctblack. The binary
    target is derived by thresholding the original y at >= 0.70. The
    disparate impact is 0.888. The resulting dataset has 1,994 rows and
    102 columns, all but one of which are numeric.

    .. _`us_crime`: https://www.openml.org/d/315

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "us_crime", "regression", astype="pandas", preprocess=False
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    blackgt6pct = orig_X.racepctblack > 0.06
    to_drop = ["racepctblack"] + [c for c in orig_X.columns if orig_X[c].hasnans]
    dropped_X = orig_X.drop(labels=to_drop, axis=1)
    encoded_X = dropped_X.assign(blackgt6pct=blackgt6pct)
    encoded_y = pd.Series(orig_y >= 0.7, name="crimegt70pct")
    fairness_info = {
        "favorable_labels": [0],
        "protected_attributes": [{"feature": "blackgt6pct", "reference_group": [0]}],
    }
    return encoded_X, encoded_y, fairness_info


# COMPAS HELPERS


def _get_compas_filename(violent_recidivism=False):
    violent_tag = ""
    if violent_recidivism:
        violent_tag = "-violent"
    filename = f"compas-scores-two-years{violent_tag}.csv"
    return filename


def _get_compas_filepath(filename):
    directory = os.path.join(
        os.path.dirname(os.path.abspath(aif360.__file__)), "data", "raw", "compas"
    )
    return os.path.join(
        directory,
        filename,
    )


def _try_download_compas(violent_recidivism=False):
    filename = _get_compas_filename(violent_recidivism=violent_recidivism)
    filepath = _get_compas_filepath(filename)
    csv_exists = os.path.exists(filepath)
    if not csv_exists:
        # this request is to a string that begins with a hardcoded https url,
        # so does not risk leaking local data
        urllib.request.urlretrieve(  # nosec
            f"https://raw.githubusercontent.com/propublica/compas-analysis/master/{filename}",
            filepath,
        )


def _get_pandas_and_fairness_info_from_compas_dataset(dataset):
    X, y = lale.lib.aif360.util.dataset_to_pandas(dataset)
    assert X is not None
    fairness_info = {
        "favorable_labels": [0],
        "protected_attributes": [
            {"feature": "sex", "reference_group": [1]},
            {"feature": "race", "reference_group": [1]},
        ],
    }
    return X, y, fairness_info


def _get_dataframe_from_compas_csv(violent_recidivism=False):
    filename = _get_compas_filename(violent_recidivism=violent_recidivism)
    filepath = _get_compas_filepath(filename)
    df: typing.Any = None
    try:
        df = pd.read_csv(filepath, index_col="id", na_values=[])
    except IOError as err:
        # In practice should not get here because of the _try_download_compas
        # call above, but adding failure logic just in case
        logger.error(f"IOError: {err}")
        logger.error("To use this class, please download the following file:")
        logger.error(
            "\n\thttps://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv"
        )
        logger.error("\nand place it, as-is, in the folder:")
        logger.error(f"\n\t{os.path.abspath(os.path.dirname(filepath))}\n")
        import sys

        sys.exit(1)
    if violent_recidivism:
        # violent recidivism dataset includes extra label column for some reason
        df = pd.DataFrame(
            df, columns=[x for x in df.columns.tolist() if x != "two_year_recid.1"]
        ).sort_index()
    return df


def _perform_default_preprocessing(df):
    return df[
        (df.days_b_screening_arrest <= 30)
        & (df.days_b_screening_arrest >= -30)
        & (df.is_recid != -1)
        & (df.c_charge_degree != "O")
        & (df.score_text != "N/A")
    ]


def _perform_custom_preprocessing(df):
    """The custom pre-processing function is adapted from
    https://github.com/fair-preprocessing/nips2017/blob/master/compas/code/Generate_Compas_Data.ipynb
    """
    df = df[
        [
            "age",
            "c_charge_degree",
            "race",
            "age_cat",
            "score_text",
            "sex",
            "priors_count",
            "days_b_screening_arrest",
            "decile_score",
            "is_recid",
            "two_year_recid",
            "c_jail_in",
            "c_jail_out",
        ]
    ]

    # Indices of data samples to keep
    ix = df["days_b_screening_arrest"] <= 30
    ix = (df["days_b_screening_arrest"] >= -30) & ix
    ix = (df["is_recid"] != -1) & ix
    ix = (df["c_charge_degree"] != "O") & ix
    ix = (df["score_text"] != "N/A") & ix
    df = df.loc[ix, :]
    df["length_of_stay"] = (
        pd.to_datetime(df["c_jail_out"]) - pd.to_datetime(df["c_jail_in"])
    ).apply(lambda x: x.days)

    # Restrict races to African-American and Caucasian
    dfcut = df.loc[
        ~df["race"].isin(["Native American", "Hispanic", "Asian", "Other"]), :
    ]

    # Restrict the features to use
    dfcutQ = dfcut[
        [
            "sex",
            "race",
            "age_cat",
            "c_charge_degree",
            "score_text",
            "priors_count",
            "is_recid",
            "two_year_recid",
            "length_of_stay",
        ]
    ].copy()

    # Quantize priors count between 0, 1-3, and >3
    def quantizePrior(x):
        if x <= 0:
            return "0"
        elif 1 <= x <= 3:
            return "1 to 3"
        else:
            return "More than 3"

    # Quantize length of stay (note: x == 8 falls through to ">3 months")
    def quantizeLOS(x):
        if x <= 7:
            return "<week"
        if 8 < x <= 93:
            return "<3months"
        else:
            return ">3 months"

    # Adjust age categories
    def adjustAge(x):
        if x == "25 - 45":
            return "25 to 45"
        else:
            return x

    # Quantize score_text to MediumHigh
    def quantizeScore(x):
        if (x == "High") | (x == "Medium"):
            return "MediumHigh"
        else:
            return x

    def group_race(x):
        if x == "Caucasian":
            return 1.0
        else:
            return 0.0

    dfcutQ["priors_count"] = dfcutQ["priors_count"].apply(quantizePrior)
    dfcutQ["length_of_stay"] = dfcutQ["length_of_stay"].apply(quantizeLOS)
    dfcutQ["score_text"] = dfcutQ["score_text"].apply(quantizeScore)
    dfcutQ["age_cat"] = dfcutQ["age_cat"].apply(adjustAge)

    # Recode sex and race
    dfcutQ["sex"] = dfcutQ["sex"].replace({"Female": 1.0, "Male": 0.0})
    dfcutQ["race"] = dfcutQ["race"].apply(group_race)

    features = [
        "two_year_recid",
        "sex",
        "race",
        "age_cat",
        "priors_count",
        "c_charge_degree",
    ]

    # Pass value to df
    df = dfcutQ[features]

    return df


def _get_pandas_and_fairness_info_from_compas_csv(violent_recidivism=False):
    df = _get_dataframe_from_compas_csv(violent_recidivism=violent_recidivism)
    # preprocessing steps performed by ProPublica team, even in the preprocess=False case
    df = _perform_default_preprocessing(df)
    X = pd.DataFrame(
        df, columns=[x for x in df.columns.tolist() if x != "two_year_recid"]
    ).sort_index()
    y = pd.Series(
        df["two_year_recid"], name="two_year_recid", dtype=np.float64
    ).sort_index()
    fairness_info = {
        "favorable_labels": [0],
        "protected_attributes": [
            {"feature": "sex", "reference_group": ["Female"]},
            {"feature": "race", "reference_group": ["Caucasian"]},
        ],
    }
    return X, y, fairness_info
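

# A small sketch showing where the helpers above cache the ProPublica CSV; a
# manual download (per the error message in _get_dataframe_from_compas_csv)
# must land in this same location.
def _example_compas_cache_path():
    filename = _get_compas_filename(violent_recidivism=False)
    return _get_compas_filepath(filename)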


def fetch_compas_df(preprocess: bool = False):
    """
    Fetch the `compas-two-years`_ dataset, also known as ProPublica recidivism, from GitHub and add `fairness_info`.

    It contains information about individuals with a binary
    classification for recidivism, indicating whether they were
    re-arrested within two years after the first arrest. Without
    preprocessing, the dataset has 6,172 rows and 51 columns. There are
    two protected attributes, sex and race, and the disparate impact is
    0.75. The data includes numeric and categorical columns, with some
    missing values.

    .. _`compas-two-years`: https://github.com/propublica/compas-analysis

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged
      groups (1 if Female or Caucasian for the corresponding sex and race
      columns respectively);
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    violent_recidivism = False
    _try_download_compas(violent_recidivism=violent_recidivism)
    if preprocess:
        # Odd finding here: "Female" is a privileged class in the dataset, but the original
        # COMPAS algorithm actually predicted worse outcomes for that class after controlling
        # for other factors. Leaving it as "Female" for now (AIF360 does this by default as well)
        # but potentially worthy of revisiting.
        # See https://www.propublica.org/article/how-we-analyzed-the-compas-recidivism-algorithm
        # and https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb
        # (hunch is that COMPAS was trained on more biased data that is not reproduced in ProPublica's dataset)
        dataset = load_preproc_data_compas()
        # above preprocessing results in a WARNING of "Missing Data: 5 rows removed from CompasDataset."
        # unclear how to resolve at the moment
        return _get_pandas_and_fairness_info_from_compas_dataset(dataset)
    else:
        return _get_pandas_and_fairness_info_from_compas_csv(
            violent_recidivism=violent_recidivism
        )


def fetch_compas_violent_df(preprocess: bool = False):
    """
    Fetch the `compas-two-years-violent`_ dataset, also known as ProPublica violent recidivism, from GitHub and add `fairness_info`.

    It contains information about individuals with a binary
    classification for violent recidivism, indicating whether they were
    re-arrested within two years after the first arrest. Without
    preprocessing, the dataset has 4,020 rows and 51 columns. There are
    three protected attributes, sex, race, and age, and the disparate
    impact is 0.85. The data includes numeric and categorical columns,
    with some missing values.

    .. _`compas-two-years-violent`: https://github.com/propublica/compas-analysis

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged
      groups (1 if Female, Caucasian, or at least 25 for the corresponding
      sex, race, and age columns respectively);
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    violent_recidivism = True
    _try_download_compas(violent_recidivism=violent_recidivism)
    if preprocess:
        # Odd finding here: "Female" is a privileged class in the dataset, but the original
        # COMPAS algorithm actually predicted worse outcomes for that class after controlling
        # for other factors. Leaving it as "Female" for now (AIF360 does this by default as well)
        # but potentially worthy of revisiting.
        # See https://www.propublica.org/article/how-we-analyzed-the-compas-recidivism-algorithm
        # and https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb
        # (hunch is that COMPAS was trained on more biased data that is not reproduced in ProPublica's dataset)
        # Loading violent recidivism dataset using StandardDataset and default settings found in the CompasDataset
        # class since AIF360 lacks a violent recidivism dataset implementation
        df = _get_dataframe_from_compas_csv(violent_recidivism=violent_recidivism)
        default_mappings = {
            "label_maps": [{1.0: "Did recid.", 0.0: "No recid."}],
            "protected_attribute_maps": [
                {0.0: "Male", 1.0: "Female"},
                {1.0: "Caucasian", 0.0: "Not Caucasian"},
            ],
        }
        dataset = aif360.datasets.StandardDataset(
            df=df,
            label_name="two_year_recid",
            favorable_classes=[0],
            protected_attribute_names=["sex", "race"],
            privileged_classes=[[1.0], [1.0]],
            categorical_features=["age_cat", "priors_count", "c_charge_degree"],
            instance_weights_name=None,
            features_to_keep=[
                "sex",
                "age_cat",
                "race",
                "priors_count",
                "c_charge_degree",
                "two_year_recid",
            ],
            features_to_drop=[],
            na_values=[],
            custom_preprocessing=_perform_custom_preprocessing,
            metadata=default_mappings,
        )
        # above preprocessing results in a WARNING of "Missing Data: 5 rows removed from StandardDataset."
        # unclear how to resolve at the moment
        return _get_pandas_and_fairness_info_from_compas_dataset(dataset)
    else:
        return _get_pandas_and_fairness_info_from_compas_csv(
            violent_recidivism=violent_recidivism
        )


def fetch_creditg_df(preprocess: bool = False):
    """
    Fetch the `credit-g`_ dataset from OpenML and add `fairness_info`.

    It contains information about individuals with a binary
    classification into good or bad credit risks. Without preprocessing,
    the dataset has 1,000 rows and 20 columns. There are two protected
    attributes, personal_status/sex and age, and the disparate impact is
    0.75. The data includes both categorical and numeric columns, with no
    missing values.

    .. _`credit-g`: https://www.openml.org/d/31

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged groups;
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "credit-g", "classification", astype="pandas", preprocess=preprocess
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    if preprocess:
        sex = pd.Series(
            (orig_X["personal_status_male div/sep"] == 1)
            | (orig_X["personal_status_male mar/wid"] == 1)
            | (orig_X["personal_status_male single"] == 1),
            dtype=np.float64,
        )
        age = pd.Series(orig_X["age"] > 25, dtype=np.float64)
        dropped_X = orig_X.drop(
            labels=[
                "personal_status_female div/dep/mar",
                "personal_status_male div/sep",
                "personal_status_male mar/wid",
                "personal_status_male single",
            ],
            axis=1,
        )
        encoded_X = dropped_X.assign(sex=sex, age=age)
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [
                {"feature": "sex", "reference_group": [1]},
                {"feature": "age", "reference_group": [1]},
            ],
        }
        return encoded_X, orig_y, fairness_info
    else:
        fairness_info = {
            "favorable_labels": ["good"],
            "protected_attributes": [
                {
                    "feature": "personal_status",
                    "reference_group": [
                        "male div/sep",
                        "male mar/wid",
                        "male single",
                    ],
                },
                {"feature": "age", "reference_group": [[26, 1000]]},
            ],
        }
        return orig_X, orig_y, fairness_info
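

# A sketch of feeding fairness_info into a mitigation operator; it assumes
# that lale.lib.aif360.DisparateImpactRemover accepts the fairness_info keys
# as hyperparameters, like the other mitigators in lale.lib.aif360, and that
# lale.lib.sklearn provides a wrapped LogisticRegression.
def _example_mitigate_creditg():
    from lale.lib.aif360 import DisparateImpactRemover
    from lale.lib.sklearn import LogisticRegression

    X, y, fairness_info = fetch_creditg_df(preprocess=True)
    # >> is Lale's pipeline combinator: repair the features, then classify
    trainable = DisparateImpactRemover(**fairness_info) >> LogisticRegression()
    trained = trainable.fit(X, y)
    return trained.predict(X)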


def fetch_ricci_df(preprocess: bool = False):
    """
    Fetch the `ricci_vs_destefano`_ dataset from OpenML and add `fairness_info`.

    It contains test scores for 2003 New Haven Fire Department promotion
    exams with a binary classification into promotion or no promotion.
    Without preprocessing, the dataset has 118 rows and 5 columns. There
    is one protected attribute, race, and the disparate impact is 0.50.
    The data includes both categorical and numeric columns, with no
    missing values.

    .. _`ricci_vs_destefano`: https://www.openml.org/d/42665

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged groups;
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "ricci", "classification", astype="pandas", preprocess=preprocess
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    if preprocess:
        race = pd.Series(orig_X["race_W"] == 1, dtype=np.float64)
        dropped_X = orig_X.drop(labels=["race_B", "race_H", "race_W"], axis=1)
        encoded_X = dropped_X.assign(race=race)
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [{"feature": "race", "reference_group": [1]}],
        }
        return encoded_X, orig_y, fairness_info
    else:
        fairness_info = {
            "favorable_labels": ["Promotion"],
            "protected_attributes": [{"feature": "race", "reference_group": ["W"]}],
        }
        return orig_X, orig_y, fairness_info


def fetch_speeddating_df(preprocess: bool = False):
    """
    Fetch the `SpeedDating`_ dataset from OpenML and add `fairness_info`.

    It contains data gathered from participants in experimental speed
    dating events from 2002-2004 with a binary classification into match
    or no match. Without preprocessing, the dataset has 8,378 rows and 122
    columns. There are two protected attributes: whether the other
    candidate has the same race (samerace) and the importance of having
    the same race (importance_same_race). The disparate impact is 0.85.
    The data includes both categorical and numeric columns, with some
    missing values.

    .. _`SpeedDating`: https://www.openml.org/d/40536

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged groups;
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "SpeedDating", "classification", astype="pandas", preprocess=preprocess
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    if preprocess:
        importance_same_race = pd.Series(
            orig_X["importance_same_race"] >= 9, dtype=np.float64
        )
        samerace = pd.Series(orig_X["samerace_1"] == 1, dtype=np.float64)
        # drop samerace-related columns
        columns_to_drop = ["samerace_0", "samerace_1"]

        # drop preprocessed columns
        def preprocessed_column_filter(x: str):
            return x.startswith("d_")

        columns_to_drop.extend(
            [x for x in orig_X.columns if preprocessed_column_filter(x)]
        )
        # drop has-null columns
        columns_to_drop.extend(["has_null_0", "has_null_1"])

        # drop decision columns
        def decision_column_filter(x: str):
            return x.startswith("decision")

        columns_to_drop.extend([x for x in orig_X.columns if decision_column_filter(x)])

        # drop field columns
        def field_column_filter(x: str):
            return x.startswith("field")

        columns_to_drop.extend([x for x in orig_X.columns if field_column_filter(x)])
        # drop wave column
        columns_to_drop.append("wave")
        dropped_X = orig_X.drop(labels=columns_to_drop, axis=1)
        encoded_X = dropped_X.assign(
            samerace=samerace, importance_same_race=importance_same_race
        )
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [
                {"feature": "samerace", "reference_group": [1]},
                {"feature": "importance_same_race", "reference_group": [1]},
            ],
        }
        return encoded_X, orig_y, fairness_info
    else:
        fairness_info = {
            "favorable_labels": ["1"],
            "protected_attributes": [
                {"feature": "samerace", "reference_group": ["1"]},
                {"feature": "importance_same_race", "reference_group": [[9, 1000]]},
            ],
        }
        return orig_X, orig_y, fairness_info


def _fetch_boston_housing_df(preprocess: bool = False):
    """
    Fetch the `Boston housing`_ dataset from sklearn and add `fairness_info`.

    It contains data about housing values in the suburbs of Boston with
    various features that can be used to perform regression. Without
    preprocessing, the dataset has 506 rows and 14 columns. There is one
    protected attribute, 1000(Bk - 0.63)^2 where Bk is the proportion of
    Blacks by town, and the disparate impact is 0.5. The data includes
    only numeric columns, with no missing values.

    This fetcher is kept private (hidden from public consumption) based
    on the issues described at length `here`_.

    .. _`Boston housing`: https://scikit-learn.org/0.20/datasets/index.html#boston-house-prices-dataset
    .. _`here`: https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True, encode protected attribute in X as 0 or 1 to indicate
      privileged groups.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.boston_housing_df(
        test_size=0.33
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    assert train_X is not None
    black_median = np.median(train_X["B"])
    label_median = np.median(train_y)
    if preprocess:
        # 1000(Bk - 0.63)^2 where Bk is the proportion of Blacks by town
        B = pd.Series(orig_X["B"] > black_median, dtype=np.float64)
        encoded_X = orig_X.assign(B=B)
        fairness_info = {
            "favorable_labels": [[-10000.0, label_median]],
            "protected_attributes": [
                {"feature": "B", "reference_group": [0]},
            ],
        }
        return encoded_X, orig_y, fairness_info
    else:
        fairness_info = {
            "favorable_labels": [[-10000.0, label_median]],
            "protected_attributes": [
                # 1000(Bk - 0.63)^2 where Bk is the proportion of Blacks by town
                {"feature": "B", "reference_group": [[0.0, black_median]]},
            ],
        }
        return orig_X, orig_y, fairness_info


def fetch_nursery_df(preprocess: bool = False):
    """
    Fetch the `nursery`_ dataset from OpenML and add `fairness_info`.

    It contains data gathered from applicants to public schools in
    Ljubljana, Slovenia during a competitive time period. Without
    preprocessing, the dataset has 12,960 rows and 8 columns. There is one
    protected attribute, parents, and the disparate impact is 0.46. The
    data has categorical columns (which become numeric if preprocessing
    is applied), with no missing values.

    .. _`nursery`: https://www.openml.org/d/26

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged groups
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "nursery", "classification", astype="pandas", preprocess=preprocess
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    if preprocess:
        parents = pd.Series(orig_X["parents_usual"] == 0, dtype=np.float64)
        dropped_X = orig_X.drop(
            labels=[
                "parents_great_pret",
                "parents_pretentious",
                "parents_usual",
            ],
            axis=1,
        )
        encoded_X = dropped_X.assign(parents=parents)
        # orig_y == 3 corresponds to "spec_prior"
        encoded_y = pd.Series((orig_y == 3), dtype=np.float64)
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [{"feature": "parents", "reference_group": [1]}],
        }
        return encoded_X, encoded_y, fairness_info
    else:
        fairness_info = {
            "favorable_labels": ["spec_prior"],
            "protected_attributes": [
                {
                    "feature": "parents",
                    "reference_group": ["great_pret", "pretentious"],
                }
            ],
        }
        return orig_X, orig_y, fairness_info


def fetch_titanic_df(preprocess: bool = False):
    """
    Fetch the `Titanic`_ dataset from OpenML and add `fairness_info`.

    It contains data gathered from passengers on the Titanic with a
    binary classification into "survived" or "did not survive". Without
    preprocessing, the dataset has 1,309 rows and 13 columns. There is one
    protected attribute, sex, and the disparate impact is 0.26. The data
    includes both categorical and numeric columns, with some missing
    values.

    .. _`Titanic`: https://www.openml.org/d/40945

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged groups;
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "titanic", "classification", astype="pandas", preprocess=preprocess
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    if preprocess:
        sex = pd.Series(orig_X["sex_female"] == 1, dtype=np.float64)
        columns_to_drop = ["sex_female", "sex_male"]

        # drop more columns that turn into gigantic one-hot encodings
        # otherwise, like name and cabin
        def extra_categorical_columns_filter(c: str):
            return (
                c.startswith("name")
                or c.startswith("ticket")
                or c.startswith("cabin")
                or c.startswith("home.dest")
            )

        columns_to_drop.extend(
            [x for x in orig_X.columns if extra_categorical_columns_filter(x)]
        )
        dropped_X = orig_X.drop(labels=columns_to_drop, axis=1)
        encoded_X = dropped_X.assign(sex=sex)
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [
                {"feature": "sex", "reference_group": [1]},
            ],
        }
        return encoded_X, orig_y, fairness_info
    else:
        fairness_info = {
            "favorable_labels": ["1"],
            "protected_attributes": [
                {"feature": "sex", "reference_group": ["female"]},
            ],
        }
        return orig_X, orig_y, fairness_info


# MEPS HELPERS


class _MepsYear(Enum):
    FY2015 = 15
    FY2016 = 16


class _MepsPanel(Enum):
    PANEL19 = 19
    PANEL20 = 20
    PANEL21 = 21


def _race(row):
    if (row["HISPANX"] == 2) and (
        row["RACEV2X"] == 1
    ):  # non-Hispanic Whites are marked as WHITE; all others as NON-WHITE
        return "White"
    return "Non-White"


def _get_utilization_columns(fiscal_year):
    return [
        f"OBTOTV{fiscal_year.value}",
        f"OPTOTV{fiscal_year.value}",
        f"ERTOT{fiscal_year.value}",
        f"IPNGTD{fiscal_year.value}",
        f"HHTOTD{fiscal_year.value}",
    ]


def _get_total_utilization(row, fiscal_year):
    cols = _get_utilization_columns(fiscal_year)
    return sum((row[x] for x in cols))


def _should_drop_column(x, fiscal_year):
    utilization_cols = set(_get_utilization_columns(fiscal_year))
    return x in utilization_cols


def _fetch_meps_raw_df(panel, fiscal_year):
    filename = ""
    if fiscal_year == _MepsYear.FY2015:
        assert panel in [_MepsPanel.PANEL19, _MepsPanel.PANEL20]
        filename = "h181.csv"
    elif fiscal_year == _MepsYear.FY2016:
        assert panel == _MepsPanel.PANEL21
        filename = "h192.csv"
    else:
        logger.error(f"Unexpected FiscalYear received: {fiscal_year}")
        raise ValueError(f"Unexpected FiscalYear received: {fiscal_year}")
    filepath = os.path.join(
        os.path.dirname(os.path.abspath(aif360.__file__)),
        "data",
        "raw",
        "meps",
        filename,
    )
    df: typing.Any = None
    try:
        df = pd.read_csv(filepath, sep=",", na_values=[])
    except IOError as err:
        logger.error(f"IOError: {err}")
        logger.error("To use this class, please follow the instructions found here:")
        logger.error(
            f"\n\t{'https://github.com/Trusted-AI/AIF360/tree/master/aif360/data/raw/meps'}\n"
        )
        logger.error(
            f"\n to download and convert the data and place the final {filename} file, as-is, in the folder:"
        )
        logger.error(f"\n\t{os.path.abspath(os.path.dirname(filepath))}\n")
        import sys

        sys.exit(1)
    df["RACEV2X"] = df.apply(_race, axis=1)
    df = df.rename(columns={"RACEV2X": "RACE"})
    df = df[df["PANEL"] == panel.value]
    # aggregate utilization under the TOTEXP15 name regardless of fiscal
    # year, binarize it (1 if at least 10 visits/events, else 0), and then
    # rename it to UTILIZATION
    df["TOTEXP15"] = df.apply(
        lambda row: _get_total_utilization(row, fiscal_year), axis=1
    )
    lessE = df["TOTEXP15"] < 10.0
    df.loc[lessE, "TOTEXP15"] = 0.0
    moreE = df["TOTEXP15"] >= 10.0
    df.loc[moreE, "TOTEXP15"] = 1.0
    df = df.rename(columns={"TOTEXP15": "UTILIZATION"})
    columns_to_drop = set(
        (x for x in df.columns.tolist() if _should_drop_column(x, fiscal_year))
    )
    df = df[sorted(set(df.columns.tolist()) - columns_to_drop, key=df.columns.get_loc)]
    X = pd.DataFrame(
        df, columns=[x for x in df.columns.tolist() if x != "UTILIZATION"]
    ).sort_index()
    y = pd.Series(df["UTILIZATION"], name="UTILIZATION").sort_index()
    fairness_info = {
        "favorable_labels": [1],
        "protected_attributes": [
            {"feature": "RACE", "reference_group": ["White"]},
        ],
    }
    return X, y, fairness_info


def _get_pandas_and_fairness_info_from_meps_dataset(dataset):
    X, y = lale.lib.aif360.util.dataset_to_pandas(dataset)
    fairness_info = {
        "favorable_labels": [1],
        "protected_attributes": [
            {"feature": "RACE", "reference_group": [1]},
        ],
    }
    return X, y, fairness_info
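

# A small sketch of the file location _fetch_meps_raw_df reads from, mirroring
# the error message above; h181.csv covers fiscal year 2015 (panels 19 and 20)
# and h192.csv covers fiscal year 2016 (panel 21).
def _example_meps_expected_path(filename="h181.csv"):
    return os.path.join(
        os.path.dirname(os.path.abspath(aif360.__file__)),
        "data",
        "raw",
        "meps",
        filename,
    )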


def fetch_meps_panel19_fy2015_df(preprocess: bool = False):
    """
    Fetch a subset of the `MEPS`_ dataset from aif360 and add `fairness_info`.

    It contains information collected on a nationally representative
    sample of the civilian noninstitutionalized population of the United
    States, specifically reported medical expenditures and civilian
    demographics. This dataframe corresponds to data from panel 19 of the
    year 2015. Without preprocessing, the dataframe contains 16,578 rows
    and 1,825 columns. (With preprocessing the dataframe contains 15,830
    rows and 138 columns.) There is one protected attribute, race, and
    the disparate impact is 0.496 if preprocessing is not applied and
    0.490 if preprocessing is applied. The data includes numeric and
    categorical columns, with some missing values.

    Note: in order to use this dataset, be sure to follow the instructions
    found in the `AIF360 documentation`_ and accept the corresponding license
    agreement.

    .. _`MEPS`: https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-181
    .. _`AIF360 documentation`: https://github.com/Trusted-AI/AIF360/tree/master/aif360/data/raw/meps

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attribute in X corresponding to race as 0 or 1
      to indicate privileged groups;
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      rename columns that are panel or round-specific;
      drop columns such as ID columns that are not relevant to the task
      at hand;
      and drop rows where features are unknown.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    if preprocess:
        dataset = aif360.datasets.MEPSDataset19()
        return _get_pandas_and_fairness_info_from_meps_dataset(dataset)
    else:
        return _fetch_meps_raw_df(_MepsPanel.PANEL19, _MepsYear.FY2015)


def fetch_meps_panel20_fy2015_df(preprocess: bool = False):
    """
    Fetch a subset of the `MEPS`_ dataset from aif360 and add `fairness_info`.

    It contains information collected on a nationally representative
    sample of the civilian noninstitutionalized population of the United
    States, specifically reported medical expenditures and civilian
    demographics. This dataframe corresponds to data from panel 20 of the
    year 2015. Without preprocessing, the dataframe contains 18,849 rows
    and 1,825 columns. (With preprocessing the dataframe contains 17,570
    rows and 138 columns.) There is one protected attribute, race, and
    the disparate impact is 0.493 if preprocessing is not applied and
    0.488 if preprocessing is applied. The data includes numeric and
    categorical columns, with some missing values.

    Note: in order to use this dataset, be sure to follow the instructions
    found in the `AIF360 documentation`_ and accept the corresponding license
    agreement.

    .. _`MEPS`: https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-181
    .. _`AIF360 documentation`: https://github.com/Trusted-AI/AIF360/tree/master/aif360/data/raw/meps

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attribute in X corresponding to race as 0 or 1
      to indicate privileged groups;
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      rename columns that are panel or round-specific;
      drop columns such as ID columns that are not relevant to the task
      at hand;
      and drop rows where features are unknown.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    if preprocess:
        dataset = aif360.datasets.MEPSDataset20()
        return _get_pandas_and_fairness_info_from_meps_dataset(dataset)
    else:
        return _fetch_meps_raw_df(_MepsPanel.PANEL20, _MepsYear.FY2015)


def fetch_meps_panel21_fy2016_df(preprocess: bool = False):
    """
    Fetch a subset of the `MEPS`_ dataset from aif360 and add `fairness_info`.

    It contains information collected on a nationally representative
    sample of the civilian noninstitutionalized population of the United
    States, specifically reported medical expenditures and civilian
    demographics. This dataframe corresponds to data from panel 21 of the
    year 2016. Without preprocessing, the dataframe contains 17,052 rows
    and 1,936 columns. (With preprocessing the dataframe contains 15,675
    rows and 138 columns.) There is one protected attribute, race, and
    the disparate impact is 0.462 if preprocessing is not applied and
    0.451 if preprocessing is applied. The data includes numeric and
    categorical columns, with some missing values.

    Note: in order to use this dataset, be sure to follow the instructions
    found in the `AIF360 documentation`_ and accept the corresponding license
    agreement.

    .. _`MEPS`: https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-181
    .. _`AIF360 documentation`: https://github.com/Trusted-AI/AIF360/tree/master/aif360/data/raw/meps

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attribute in X corresponding to race as 0 or 1
      to indicate privileged groups;
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      rename columns that are panel or round-specific;
      drop columns such as ID columns that are not relevant to the task
      at hand;
      and drop rows where features are unknown.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    if preprocess:
        dataset = aif360.datasets.MEPSDataset21()
        return _get_pandas_and_fairness_info_from_meps_dataset(dataset)
    else:
        return _fetch_meps_raw_df(_MepsPanel.PANEL21, _MepsYear.FY2016)
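

# A closing sketch: the MEPS fetchers compose with the same metric API as the
# OpenML-based ones. It assumes statistical_parity_difference is among the
# metric factories exported by lale.lib.aif360; verify against the installed
# lale version.
def _example_meps_parity():
    import lale.lib.aif360

    X, y, fairness_info = fetch_meps_panel19_fy2015_df(preprocess=True)
    spd_scorer = lale.lib.aif360.statistical_parity_difference(**fairness_info)
    return spd_scorer.score_data(X=X, y_pred=y)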