Source code for lale.lib.aif360.datasets

# Copyright 2021-2023 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Fetcher methods to load fairness datasets and provide fairness_info for them.

See the notebook `demo_fairness_datasets`_ for an example of using
these functions, along with some tables and figures about the datasets.
There is also an `arXiv paper`_ about these datasets.
Some of the fetcher methods have a `preprocess` argument that
defaults to False.
The notebook does not use that argument, instead demonstrating how
to do any required preprocessing in the context of a Lale pipeline.
Most of the datasets are from `OpenML`_, a few are from `meps.ahrq`_ or
`ProPublica`_, and most of them have been used in various papers.
The Lale library does not distribute the datasets themselves; it only
provides methods for downloading them.

.. _`demo_fairness_datasets`: https://github.com/IBM/lale/blob/master/examples/demo_fairness_datasets.ipynb
.. _`arXiv paper`: https://arxiv.org/abs/2308.00133
.. _`OpenML`: https://www.openml.org/
.. _`meps.ahrq`: https://meps.ahrq.gov/data_stats/data_use.jsp
.. _`ProPublica`: https://github.com/propublica/compas-analysis
"""

import logging
import os
import typing
import urllib.request
from enum import Enum

import aif360
import aif360.datasets
import numpy as np
import pandas as pd
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import (
    load_preproc_data_compas,
)

import lale.datasets
import lale.datasets.openml
import lale.lib.aif360.util

logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)
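

# A minimal usage sketch (not part of the original fetchers): every fetcher in
# this module returns a triple (X, y, fairness_info), and fairness_info can be
# splatted into the fairness metric factories of lale.lib.aif360. Treat the
# exact metric API used here (disparate_impact and its score_data method) as
# an assumption to verify against the installed lale version.
def _example_measure_disparate_impact():
    import lale.lib.aif360

    X, y, fairness_info = fetch_adult_df(preprocess=True)
    # disparate_impact returns a scorer; score_data evaluates the metric
    # directly on labeled data instead of on a trained estimator
    di_scorer = lale.lib.aif360.disparate_impact(**fairness_info)
    return di_scorer.score_data(X=X, y_pred=y)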


def fetch_adult_df(preprocess: bool = False):
    """
    Fetch the `adult`_ dataset from OpenML and add `fairness_info`.

    It contains information about individuals from the 1994 U.S. census.
    The prediction task is a binary classification on whether the income
    of a person exceeds 50K a year. Without preprocessing, the dataset has
    48,842 rows and 14 columns. There are two protected attributes, sex
    and race, and the disparate impact is 0.23. The data includes both
    categorical and numeric columns, and has some missing values.

    .. _`adult`: https://www.openml.org/d/179

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      impute missing values;
      encode protected attributes in X as 0 or 1 to indicate privileged groups;
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "adult", "classification", astype="pandas", preprocess=preprocess
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    if preprocess:
        sex = pd.Series(orig_X["sex_Male"] == 1, dtype=np.float64)
        race = pd.Series(orig_X["race_White"] == 1, dtype=np.float64)
        dropped_X = orig_X.drop(
            labels=[
                "race_Amer-Indian-Eskimo",
                "race_Asian-Pac-Islander",
                "race_Black",
                "race_Other",
                "race_White",
                "sex_Female",
                "sex_Male",
            ],
            axis=1,
        )
        encoded_X = dropped_X.assign(sex=sex, race=race)
        assert not encoded_X.isna().any().any()
        assert not orig_y.isna().any().any()
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [
                {"feature": "sex", "reference_group": [1]},
                {"feature": "race", "reference_group": [1]},
            ],
        }
        return encoded_X, orig_y, fairness_info
    else:
        fairness_info = {
            "favorable_labels": [">50K"],
            "protected_attributes": [
                {"feature": "race", "reference_group": ["White"]},
                {"feature": "sex", "reference_group": ["Male"]},
            ],
        }
        return orig_X, orig_y, fairness_info
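

# A sketch of splitting a fetched dataset while stratifying on both the labels
# and the protected attributes. It assumes fair_stratified_train_test_split is
# exported by lale.lib.aif360 with this keyword-splat signature; verify against
# the installed lale version.
def _example_split_adult():
    import lale.lib.aif360

    X, y, fairness_info = fetch_adult_df(preprocess=True)
    train_X, test_X, train_y, test_y = lale.lib.aif360.fair_stratified_train_test_split(
        X, y, **fairness_info, test_size=0.33
    )
    return train_X, test_X, train_y, test_y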


def fetch_bank_df(preprocess: bool = False):
    """
    Fetch the `bank-marketing`_ dataset from OpenML and add `fairness_info`.

    It contains information from marketing campaigns of a Portuguese bank.
    The prediction task is a binary classification on whether the client
    will subscribe to a term deposit. Without preprocessing, the dataset
    has 45,211 rows and 16 columns. There is one protected attribute, age,
    and the disparate impact is 0.84. The data includes both categorical
    and numeric columns, with no missing values.

    .. _`bank-marketing`: https://www.openml.org/d/1461

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged groups;
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "bank-marketing", "classification", astype="pandas", preprocess=preprocess
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index().astype(np.float64)
    column_map = {
        "v1": "age",
        "v2": "job",
        "v3": "marital",
        "v4": "education",
        "v5": "default",
        "v6": "balance",
        "v7": "housing",
        "v8": "loan",
        "v9": "contact",
        "v10": "day",
        "v11": "month",
        "v12": "duration",
        "v13": "campaign",
        "v14": "pdays",
        "v15": "previous",
        "v16": "poutcome",
    }
    if preprocess:

        def map_col(col):
            if col.find("_") == -1:
                return column_map[col]
            prefix, suffix = col.split("_")
            return column_map[prefix] + "_" + suffix

        orig_X.columns = [map_col(col) for col in orig_X.columns]
        age = pd.Series(orig_X["age"] >= 25, dtype=np.float64)
        encoded_X = orig_X.assign(age=age)
        encoded_y = pd.Series(orig_y == 0, dtype=np.float64, name=orig_y.name)
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [
                {"feature": "age", "reference_group": [1]},
            ],
        }
        return encoded_X, encoded_y, fairness_info
    else:
        orig_X.columns = [column_map[col] for col in orig_X.columns]
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [
                {"feature": "age", "reference_group": [[25, 1000]]},
            ],
        }
        return orig_X, orig_y, fairness_info


def fetch_default_credit_df():
    """
    Fetch the `Default of Credit Card Clients Dataset`_ from OpenML and add `fairness_info`.

    It is a binary classification to predict whether the customer suffers
    a default in the next month (1) or not (0). The dataset has 30,000
    rows and 24 columns, all numeric. The protected attribute is sex and
    the disparate impact is 0.957.

    .. _`Default of Credit Card Clients Dataset`: https://www.openml.org/d/43435

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "Default-of-Credit-Card-Clients-Dataset",
        "classification",
        astype="pandas",
        preprocess=False,
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    fairness_info = {
        "favorable_labels": [0],
        "protected_attributes": [
            {"feature": "sex", "reference_group": [2]},  # female
        ],
    }
    return orig_X, orig_y, fairness_info


def fetch_heart_disease_df():
    """
    Fetch the `heart-disease`_ dataset from OpenML and add `fairness_info`.

    It is a binary classification to predict heart disease from the
    Cleveland database, with 303 rows and 13 columns, all numeric.
    The protected attribute is age and the disparate impact is 0.589.

    .. _`heart-disease`: https://www.openml.org/d/43398

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "heart-disease", "classification", astype="pandas", preprocess=False
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    fairness_info = {
        "favorable_labels": [1],
        "protected_attributes": [
            {"feature": "age", "reference_group": [[0, 54]]},
        ],
    }
    return orig_X, orig_y, fairness_info


def fetch_law_school_df():
    """
    Fetch the `law school`_ dataset from OpenML and add `fairness_info`.

    This function returns both X and y unchanged, since the dataset was
    already binarized by the OpenML contributors, with the target of
    predicting whether the GPA is greater than 3. The protected attribute
    is race1 and the disparate impact is 0.704. The dataset has 20,800
    rows and 11 columns (5 categorical and 6 numeric columns).

    .. _`law school`: https://www.openml.org/d/43890

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "law-school-admission-bianry",  # "bianry" [sic] matches the dataset name on OpenML
        "classification",
        astype="pandas",
        preprocess=False,
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    fairness_info = {
        "favorable_labels": ["TRUE"],
        "protected_attributes": [
            {"feature": "race1", "reference_group": ["white"]},
        ],
    }
    return orig_X, orig_y, fairness_info


def fetch_nlsy_df():
    """
    Fetch the `National Longitudinal Survey for the Youth (NLSY)`_ (also known
    as "University of Michigan Health and Retirement Study (HRS)") dataset
    from OpenML and add `fairness_info`.

    It is a binary classification to predict whether the income at a
    certain time exceeds a threshold, with 4,908 rows and 15 columns
    (comprising 6 categorical and 9 numerical columns). The protected
    attributes are age and gender and the disparate impact is 0.668.

    .. _`National Longitudinal Survey for the Youth (NLSY)`: https://www.openml.org/d/43892

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "national-longitudinal-survey-binary",
        "classification",
        astype="pandas",
        preprocess=False,
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    # drop income96, which is closely tied to the income-derived target
    dropped_X = orig_X.drop(labels=["income96"], axis=1)
    fairness_info = {
        "favorable_labels": ["1"],
        "protected_attributes": [
            {"feature": "age", "reference_group": [[18, 120]]},
            {"feature": "gender", "reference_group": ["Male"]},
        ],
    }
    return dropped_X, orig_y, fairness_info


def fetch_student_math_df():
    """
    Fetch the `Student Performance (Math)`_ dataset from OpenML and add `fairness_info`.

    The original prediction target is an integer math grade from 1 to 20.
    This function returns X unchanged but with a binarized version of the
    target y, using 1 for values >=10 and 0 otherwise. The two protected
    attributes are sex and age and the disparate impact is 0.894. The
    dataset has 395 rows and 32 columns, including both categorical and
    numeric columns.

    .. _`Student Performance (Math)`: https://www.openml.org/d/42352

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "UCI-student-performance-mat", "regression", astype="pandas", preprocess=False
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    # binarize the grade at >= 10, matching the docstring and the y name
    encoded_y = pd.Series(orig_y >= 10, dtype=np.float64, name="g3_ge_10")
    fairness_info = {
        "favorable_labels": [1],
        "protected_attributes": [
            {"feature": "sex", "reference_group": ["F"]},
            {"feature": "age", "reference_group": [[0, 17]]},
        ],
    }
    return orig_X, encoded_y, fairness_info


def fetch_student_por_df():
    """
    Fetch the `Student Performance (Portuguese)`_ dataset from OpenML and add `fairness_info`.

    The original prediction target is an integer Portuguese grade from 1
    to 20. This function returns X unchanged but with a binarized version
    of the target y, using 1 for values >=10 and 0 otherwise. The two
    protected attributes are sex and age and the disparate impact is
    0.858. The dataset has 649 rows and 32 columns, including both
    categorical and numeric columns.

    .. _`Student Performance (Portuguese)`: https://www.openml.org/d/42351

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "UCI-student-performance-por", "regression", astype="pandas", preprocess=False
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    encoded_y = pd.Series(orig_y >= 10, dtype=np.float64, name="g3_ge_10")
    fairness_info = {
        "favorable_labels": [1],
        "protected_attributes": [
            {"feature": "sex", "reference_group": ["F"]},
            {"feature": "age", "reference_group": [[0, 17]]},
        ],
    }
    return orig_X, encoded_y, fairness_info


def fetch_tae_df(preprocess: bool = False):
    """
    Fetch the `tae`_ dataset from OpenML and add `fairness_info`.

    It contains information from teaching assistant (TA) evaluations at
    the University of Wisconsin--Madison. The prediction task is a
    classification on the type of rating a TA receives (1=Low, 2=Medium,
    3=High). Without preprocessing, the dataset has 151 rows and 5
    columns. There is one protected attribute,
    "whether_of_not_the_ta_is_a_native_english_speaker" [sic], and the
    disparate impact is 0.45. The data includes both categorical and
    numeric columns, with no missing values.

    .. _`tae`: https://www.openml.org/d/48

    Parameters
    ----------
    preprocess : boolean or "y", optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged
      group ("native_english_speaker");
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.
      If "y", leave features X unchanged and only encode labels y as 0 or 1.
      If False, encode neither features X nor labels y.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "tae", "classification", astype="pandas", preprocess=(preprocess is True)
    )
    orig_X = pd.concat([train_X, test_X]).sort_index().astype(np.float64)
    orig_y = pd.concat([train_y, test_y]).sort_index().astype(np.float64)
    if preprocess is True:
        native_english_speaker = pd.Series(
            orig_X["whether_of_not_the_ta_is_a_native_english_speaker_1"] == 1,
            dtype=np.float64,
        )
        dropped_X = orig_X.drop(
            labels=[
                "whether_of_not_the_ta_is_a_native_english_speaker_1",
                "whether_of_not_the_ta_is_a_native_english_speaker_2",
            ],
            axis=1,
        )
        encoded_X = dropped_X.assign(native_english_speaker=native_english_speaker)
        # with preprocess=True, fetch label-encodes ratings 1/2/3 as 0/1/2,
        # so encoded value 2 corresponds to the original rating 3 (High)
        encoded_y = pd.Series(orig_y == 2, dtype=np.float64)
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [
                {"feature": "native_english_speaker", "reference_group": [1]},
            ],
        }
        return encoded_X, encoded_y, fairness_info
    elif preprocess == "y":
        # labels are unencoded here, so the favorable outcome is the
        # original rating 3 (High), consistent with the False case below
        encoded_y = pd.Series(orig_y == 3, dtype=np.float64)
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [
                {
                    "feature": "whether_of_not_the_ta_is_a_native_english_speaker",
                    "reference_group": [1],
                },
            ],
        }
        return orig_X, encoded_y, fairness_info
    else:
        fairness_info = {
            "favorable_labels": [3],
            "protected_attributes": [
                {
                    "feature": "whether_of_not_the_ta_is_a_native_english_speaker",
                    "reference_group": [1],
                },
            ],
        }
        return orig_X, orig_y, fairness_info
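

# A small sketch of tae's three-valued preprocess flag: passing "y" binarizes
# only the labels, leaving the raw features for downstream encoders inside a
# Lale pipeline.
def _example_tae_label_only():
    X, y, fairness_info = fetch_tae_df(preprocess="y")
    # X keeps its original columns; y is already 0/1 with 1 favorable
    return X.columns.tolist(), sorted(y.unique()), fairness_info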


def fetch_us_crime_df():
    """
    Fetch the `us_crime`_ (also known as "communities and crime") dataset from OpenML and add `fairness_info`.

    The original dataset has several columns with a large number of
    missing values, which this function drops. The binary protected
    attribute is blackgt6pct, which is derived by thresholding
    racepctblack > 0.06 and dropping the original racepctblack. The binary
    target is derived by thresholding the original y at >= 0.70. The
    disparate impact is 0.888. The resulting dataset has 1,994 rows and
    102 columns, all but one of which are numeric.

    .. _`us_crime`: https://www.openml.org/d/315

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "us_crime", "regression", astype="pandas", preprocess=False
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    blackgt6pct = orig_X.racepctblack > 0.06
    to_drop = ["racepctblack"] + [c for c in orig_X.columns if orig_X[c].hasnans]
    dropped_X = orig_X.drop(labels=to_drop, axis=1)
    encoded_X = dropped_X.assign(blackgt6pct=blackgt6pct)
    encoded_y = pd.Series(orig_y >= 0.7, name="crimegt70pct")
    fairness_info = {
        "favorable_labels": [0],
        "protected_attributes": [{"feature": "blackgt6pct", "reference_group": [0]}],
    }
    return encoded_X, encoded_y, fairness_info


# COMPAS HELPERS


def _get_compas_filename(violent_recidivism=False):
    violent_tag = ""
    if violent_recidivism:
        violent_tag = "-violent"
    filename = f"compas-scores-two-years{violent_tag}.csv"
    return filename


def _get_compas_filepath(filename):
    directory = os.path.join(
        os.path.dirname(os.path.abspath(aif360.__file__)), "data", "raw", "compas"
    )
    return os.path.join(
        directory,
        filename,
    )


def _try_download_compas(violent_recidivism=False):
    filename = _get_compas_filename(violent_recidivism=violent_recidivism)
    filepath = _get_compas_filepath(filename)
    csv_exists = os.path.exists(filepath)
    if not csv_exists:
        # this request is to a string that begins with a hardcoded https url,
        # so does not risk leaking local data
        urllib.request.urlretrieve(  # nosec
            f"https://raw.githubusercontent.com/propublica/compas-analysis/master/{filename}",
            filepath,
        )


def _get_pandas_and_fairness_info_from_compas_dataset(dataset):
    X, y = lale.lib.aif360.util.dataset_to_pandas(dataset)
    assert X is not None
    fairness_info = {
        "favorable_labels": [0],
        "protected_attributes": [
            {"feature": "sex", "reference_group": [1]},
            {"feature": "race", "reference_group": [1]},
        ],
    }
    return X, y, fairness_info


def _get_dataframe_from_compas_csv(violent_recidivism=False):
    filename = _get_compas_filename(violent_recidivism=violent_recidivism)
    filepath = _get_compas_filepath(filename)
    df: typing.Any = None
    try:
        df = pd.read_csv(filepath, index_col="id", na_values=[])
    except IOError as err:
        # In practice should not get here because of the _try_download_compas
        # call above, but adding failure logic just in case
        logger.error(f"IOError: {err}")
        logger.error("To use this class, please download the following file:")
        logger.error(
            "\n\thttps://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv"
        )
        logger.error("\nand place it, as-is, in the folder:")
        logger.error(f"\n\t{os.path.abspath(os.path.dirname(filepath))}\n")
        import sys

        sys.exit(1)
    if violent_recidivism:
        # violent recidivism dataset includes extra label column for some reason
        df = pd.DataFrame(
            df, columns=[x for x in df.columns.tolist() if x != "two_year_recid.1"]
        ).sort_index()
    return df


def _perform_default_preprocessing(df):
    return df[
        (df.days_b_screening_arrest <= 30)
        & (df.days_b_screening_arrest >= -30)
        & (df.is_recid != -1)
        & (df.c_charge_degree != "O")
        & (df.score_text != "N/A")
    ]


def _perform_custom_preprocessing(df):
    """The custom pre-processing function is adapted from
    https://github.com/fair-preprocessing/nips2017/blob/master/compas/code/Generate_Compas_Data.ipynb
    """
    df = df[
        [
            "age",
            "c_charge_degree",
            "race",
            "age_cat",
            "score_text",
            "sex",
            "priors_count",
            "days_b_screening_arrest",
            "decile_score",
            "is_recid",
            "two_year_recid",
            "c_jail_in",
            "c_jail_out",
        ]
    ]

    # Indices of data samples to keep
    ix = df["days_b_screening_arrest"] <= 30
    ix = (df["days_b_screening_arrest"] >= -30) & ix
    ix = (df["is_recid"] != -1) & ix
    ix = (df["c_charge_degree"] != "O") & ix
    ix = (df["score_text"] != "N/A") & ix
    df = df.loc[ix, :]
    df["length_of_stay"] = (
        pd.to_datetime(df["c_jail_out"]) - pd.to_datetime(df["c_jail_in"])
    ).apply(lambda x: x.days)

    # Restrict races to African-American and Caucasian
    dfcut = df.loc[
        ~df["race"].isin(["Native American", "Hispanic", "Asian", "Other"]), :
    ]

    # Restrict the features to use
    dfcutQ = dfcut[
        [
            "sex",
            "race",
            "age_cat",
            "c_charge_degree",
            "score_text",
            "priors_count",
            "is_recid",
            "two_year_recid",
            "length_of_stay",
        ]
    ].copy()

    # Quantize priors count between 0, 1-3, and >3
    def quantizePrior(x):
        if x <= 0:
            return "0"
        elif 1 <= x <= 3:
            return "1 to 3"
        else:
            return "More than 3"

    # Quantize length of stay (note: x == 8 falls through to ">3 months")
    def quantizeLOS(x):
        if x <= 7:
            return "<week"
        if 8 < x <= 93:
            return "<3months"
        else:
            return ">3 months"

    # Adjust age categories
    def adjustAge(x):
        if x == "25 - 45":
            return "25 to 45"
        else:
            return x

    # Quantize score_text to MediumHigh
    def quantizeScore(x):
        if (x == "High") | (x == "Medium"):
            return "MediumHigh"
        else:
            return x

    def group_race(x):
        if x == "Caucasian":
            return 1.0
        else:
            return 0.0

    dfcutQ["priors_count"] = dfcutQ["priors_count"].apply(quantizePrior)
    dfcutQ["length_of_stay"] = dfcutQ["length_of_stay"].apply(quantizeLOS)
    dfcutQ["score_text"] = dfcutQ["score_text"].apply(quantizeScore)
    dfcutQ["age_cat"] = dfcutQ["age_cat"].apply(adjustAge)

    # Recode sex and race
    dfcutQ["sex"] = dfcutQ["sex"].replace({"Female": 1.0, "Male": 0.0})
    dfcutQ["race"] = dfcutQ["race"].apply(group_race)

    features = [
        "two_year_recid",
        "sex",
        "race",
        "age_cat",
        "priors_count",
        "c_charge_degree",
    ]

    # Pass value to df
    df = dfcutQ[features]

    return df


def _get_pandas_and_fairness_info_from_compas_csv(violent_recidivism=False):
    df = _get_dataframe_from_compas_csv(violent_recidivism=violent_recidivism)
    # preprocessing steps performed by ProPublica team, even in the preprocess=False case
    df = _perform_default_preprocessing(df)
    X = pd.DataFrame(
        df, columns=[x for x in df.columns.tolist() if x != "two_year_recid"]
    ).sort_index()
    y = pd.Series(
        df["two_year_recid"], name="two_year_recid", dtype=np.float64
    ).sort_index()
    fairness_info = {
        "favorable_labels": [0],
        "protected_attributes": [
            {"feature": "sex", "reference_group": ["Female"]},
            {"feature": "race", "reference_group": ["Caucasian"]},
        ],
    }
    return X, y, fairness_info
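

# A small sketch showing where the helpers above cache the ProPublica CSV; a
# manual download (per the error message in _get_dataframe_from_compas_csv)
# must land in this same location.
def _example_compas_cache_path():
    filename = _get_compas_filename(violent_recidivism=False)
    return _get_compas_filepath(filename)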


def fetch_compas_df(preprocess: bool = False):
    """
    Fetch the `compas-two-years`_ dataset, also known as ProPublica recidivism, from GitHub and add `fairness_info`.

    It contains information about individuals with a binary
    classification for recidivism, indicating whether they were
    re-arrested within two years after the first arrest. Without
    preprocessing, the dataset has 6,172 rows and 51 columns. There are
    two protected attributes, sex and race, and the disparate impact is
    0.75. The data includes numeric and categorical columns, with some
    missing values.

    .. _`compas-two-years`: https://github.com/propublica/compas-analysis

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged
      groups (1 if Female or Caucasian for the corresponding sex and race
      columns respectively);
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    violent_recidivism = False
    _try_download_compas(violent_recidivism=violent_recidivism)
    if preprocess:
        # Odd finding here: "Female" is a privileged class in the dataset, but the original
        # COMPAS algorithm actually predicted worse outcomes for that class after controlling
        # for other factors. Leaving it as "Female" for now (AIF360 does this by default as well)
        # but potentially worthy of revisiting.
        # See https://www.propublica.org/article/how-we-analyzed-the-compas-recidivism-algorithm
        # and https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb
        # (hunch is that COMPAS was trained on more biased data that is not reproduced in ProPublica's dataset)
        dataset = load_preproc_data_compas()
        # above preprocessing results in a WARNING of "Missing Data: 5 rows removed from CompasDataset."
        # unclear how to resolve at the moment
        return _get_pandas_and_fairness_info_from_compas_dataset(dataset)
    else:
        return _get_pandas_and_fairness_info_from_compas_csv(
            violent_recidivism=violent_recidivism
        )


def fetch_compas_violent_df(preprocess: bool = False):
    """
    Fetch the `compas-two-years-violent`_ dataset, also known as ProPublica violent recidivism, from GitHub and add `fairness_info`.

    It contains information about individuals with a binary
    classification for violent recidivism, indicating whether they were
    re-arrested within two years after the first arrest. Without
    preprocessing, the dataset has 4,020 rows and 51 columns. There are
    three protected attributes, sex, race, and age, and the disparate
    impact is 0.85. The data includes numeric and categorical columns,
    with some missing values.

    .. _`compas-two-years-violent`: https://github.com/propublica/compas-analysis

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged
      groups (1 if Female, Caucasian, or at least 25 for the corresponding
      sex, race, and age columns respectively);
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    violent_recidivism = True
    _try_download_compas(violent_recidivism=violent_recidivism)
    if preprocess:
        # Odd finding here: "Female" is a privileged class in the dataset, but the original
        # COMPAS algorithm actually predicted worse outcomes for that class after controlling
        # for other factors. Leaving it as "Female" for now (AIF360 does this by default as well)
        # but potentially worthy of revisiting.
        # See https://www.propublica.org/article/how-we-analyzed-the-compas-recidivism-algorithm
        # and https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb
        # (hunch is that COMPAS was trained on more biased data that is not reproduced in ProPublica's dataset)
        # Loading violent recidivism dataset using StandardDataset and default settings found in the CompasDataset
        # class since AIF360 lacks a violent recidivism dataset implementation
        df = _get_dataframe_from_compas_csv(violent_recidivism=violent_recidivism)
        default_mappings = {
            "label_maps": [{1.0: "Did recid.", 0.0: "No recid."}],
            "protected_attribute_maps": [
                {0.0: "Male", 1.0: "Female"},
                {1.0: "Caucasian", 0.0: "Not Caucasian"},
            ],
        }
        dataset = aif360.datasets.StandardDataset(
            df=df,
            label_name="two_year_recid",
            favorable_classes=[0],
            protected_attribute_names=["sex", "race"],
            privileged_classes=[[1.0], [1.0]],
            categorical_features=["age_cat", "priors_count", "c_charge_degree"],
            instance_weights_name=None,
            features_to_keep=[
                "sex",
                "age_cat",
                "race",
                "priors_count",
                "c_charge_degree",
                "two_year_recid",
            ],
            features_to_drop=[],
            na_values=[],
            custom_preprocessing=_perform_custom_preprocessing,
            metadata=default_mappings,
        )
        # above preprocessing results in a WARNING of "Missing Data: 5 rows removed from StandardDataset."
        # unclear how to resolve at the moment
        return _get_pandas_and_fairness_info_from_compas_dataset(dataset)
    else:
        return _get_pandas_and_fairness_info_from_compas_csv(
            violent_recidivism=violent_recidivism
        )


def fetch_creditg_df(preprocess: bool = False):
    """
    Fetch the `credit-g`_ dataset from OpenML and add `fairness_info`.

    It contains information about individuals with a binary
    classification into good or bad credit risks. Without preprocessing,
    the dataset has 1,000 rows and 20 columns. There are two protected
    attributes, personal_status/sex and age, and the disparate impact is
    0.75. The data includes both categorical and numeric columns, with no
    missing values.

    .. _`credit-g`: https://www.openml.org/d/31

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged groups;
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "credit-g", "classification", astype="pandas", preprocess=preprocess
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    if preprocess:
        sex = pd.Series(
            (orig_X["personal_status_male div/sep"] == 1)
            | (orig_X["personal_status_male mar/wid"] == 1)
            | (orig_X["personal_status_male single"] == 1),
            dtype=np.float64,
        )
        age = pd.Series(orig_X["age"] > 25, dtype=np.float64)
        dropped_X = orig_X.drop(
            labels=[
                "personal_status_female div/dep/mar",
                "personal_status_male div/sep",
                "personal_status_male mar/wid",
                "personal_status_male single",
            ],
            axis=1,
        )
        encoded_X = dropped_X.assign(sex=sex, age=age)
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [
                {"feature": "sex", "reference_group": [1]},
                {"feature": "age", "reference_group": [1]},
            ],
        }
        return encoded_X, orig_y, fairness_info
    else:
        fairness_info = {
            "favorable_labels": ["good"],
            "protected_attributes": [
                {
                    "feature": "personal_status",
                    "reference_group": [
                        "male div/sep",
                        "male mar/wid",
                        "male single",
                    ],
                },
                {"feature": "age", "reference_group": [[26, 1000]]},
            ],
        }
        return orig_X, orig_y, fairness_info
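

# A sketch of feeding fairness_info into a mitigation operator; it assumes
# that lale.lib.aif360.DisparateImpactRemover accepts the fairness_info keys
# as hyperparameters, like the other mitigators in lale.lib.aif360, and that
# lale.lib.sklearn provides a wrapped LogisticRegression.
def _example_mitigate_creditg():
    from lale.lib.aif360 import DisparateImpactRemover
    from lale.lib.sklearn import LogisticRegression

    X, y, fairness_info = fetch_creditg_df(preprocess=True)
    # >> is Lale's pipeline combinator: repair the features, then classify
    trainable = DisparateImpactRemover(**fairness_info) >> LogisticRegression()
    trained = trainable.fit(X, y)
    return trained.predict(X)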


def fetch_ricci_df(preprocess: bool = False):
    """
    Fetch the `ricci_vs_destefano`_ dataset from OpenML and add `fairness_info`.

    It contains test scores for 2003 New Haven Fire Department promotion
    exams with a binary classification into promotion or no promotion.
    Without preprocessing, the dataset has 118 rows and 5 columns. There
    is one protected attribute, race, and the disparate impact is 0.50.
    The data includes both categorical and numeric columns, with no
    missing values.

    .. _`ricci_vs_destefano`: https://www.openml.org/d/42665

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged groups;
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "ricci", "classification", astype="pandas", preprocess=preprocess
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    if preprocess:
        race = pd.Series(orig_X["race_W"] == 1, dtype=np.float64)
        dropped_X = orig_X.drop(labels=["race_B", "race_H", "race_W"], axis=1)
        encoded_X = dropped_X.assign(race=race)
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [{"feature": "race", "reference_group": [1]}],
        }
        return encoded_X, orig_y, fairness_info
    else:
        fairness_info = {
            "favorable_labels": ["Promotion"],
            "protected_attributes": [{"feature": "race", "reference_group": ["W"]}],
        }
        return orig_X, orig_y, fairness_info


def fetch_speeddating_df(preprocess: bool = False):
    """
    Fetch the `SpeedDating`_ dataset from OpenML and add `fairness_info`.

    It contains data gathered from participants in experimental speed
    dating events from 2002-2004 with a binary classification into match
    or no match. Without preprocessing, the dataset has 8,378 rows and 122
    columns. There are two protected attributes: whether the other
    candidate has the same race (samerace) and the importance of having
    the same race (importance_same_race). The disparate impact is 0.85.
    The data includes both categorical and numeric columns, with some
    missing values.

    .. _`SpeedDating`: https://www.openml.org/d/40536

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged groups;
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "SpeedDating", "classification", astype="pandas", preprocess=preprocess
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    if preprocess:
        importance_same_race = pd.Series(
            orig_X["importance_same_race"] >= 9, dtype=np.float64
        )
        samerace = pd.Series(orig_X["samerace_1"] == 1, dtype=np.float64)
        # drop samerace-related columns
        columns_to_drop = ["samerace_0", "samerace_1"]

        # drop preprocessed columns
        def preprocessed_column_filter(x: str):
            return x.startswith("d_")

        columns_to_drop.extend(
            [x for x in orig_X.columns if preprocessed_column_filter(x)]
        )
        # drop has-null columns
        columns_to_drop.extend(["has_null_0", "has_null_1"])

        # drop decision columns
        def decision_column_filter(x: str):
            return x.startswith("decision")

        columns_to_drop.extend([x for x in orig_X.columns if decision_column_filter(x)])

        # drop field columns
        def field_column_filter(x: str):
            return x.startswith("field")

        columns_to_drop.extend([x for x in orig_X.columns if field_column_filter(x)])
        # drop wave column
        columns_to_drop.append("wave")
        dropped_X = orig_X.drop(labels=columns_to_drop, axis=1)
        encoded_X = dropped_X.assign(
            samerace=samerace, importance_same_race=importance_same_race
        )
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [
                {"feature": "samerace", "reference_group": [1]},
                {"feature": "importance_same_race", "reference_group": [1]},
            ],
        }
        return encoded_X, orig_y, fairness_info
    else:
        fairness_info = {
            "favorable_labels": ["1"],
            "protected_attributes": [
                {"feature": "samerace", "reference_group": ["1"]},
                {"feature": "importance_same_race", "reference_group": [[9, 1000]]},
            ],
        }
        return orig_X, orig_y, fairness_info


def _fetch_boston_housing_df(preprocess: bool = False):
    """
    Fetch the `Boston housing`_ dataset from sklearn and add `fairness_info`.

    It contains data about housing values in the suburbs of Boston with
    various features that can be used to perform regression. Without
    preprocessing, the dataset has 506 rows and 14 columns. There is one
    protected attribute, 1000(Bk - 0.63)^2 where Bk is the proportion of
    Blacks by town, and the disparate impact is 0.5. The data includes
    only numeric columns, with no missing values.

    This fetcher is kept private (hidden from public consumption) based
    on the issues described at length `here`_.

    .. _`Boston housing`: https://scikit-learn.org/0.20/datasets/index.html#boston-house-prices-dataset
    .. _`here`: https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True, encode protected attribute in X as 0 or 1 to indicate
      privileged groups.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.boston_housing_df(
        test_size=0.33
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    assert train_X is not None
    black_median = np.median(train_X["B"])
    label_median = np.median(train_y)
    if preprocess:
        # 1000(Bk - 0.63)^2 where Bk is the proportion of Blacks by town
        B = pd.Series(orig_X["B"] > black_median, dtype=np.float64)
        encoded_X = orig_X.assign(B=B)
        fairness_info = {
            "favorable_labels": [[-10000.0, label_median]],
            "protected_attributes": [
                {"feature": "B", "reference_group": [0]},
            ],
        }
        return encoded_X, orig_y, fairness_info
    else:
        fairness_info = {
            "favorable_labels": [[-10000.0, label_median]],
            "protected_attributes": [
                # 1000(Bk - 0.63)^2 where Bk is the proportion of Blacks by town
                {"feature": "B", "reference_group": [[0.0, black_median]]},
            ],
        }
        return orig_X, orig_y, fairness_info


def fetch_nursery_df(preprocess: bool = False):
    """
    Fetch the `nursery`_ dataset from OpenML and add `fairness_info`.

    It contains data gathered from applicants to public schools in
    Ljubljana, Slovenia during a competitive time period. Without
    preprocessing, the dataset has 12,960 rows and 8 columns. There is one
    protected attribute, parents, and the disparate impact is 0.46. The
    data has categorical columns (which become numeric if preprocessing
    is applied), with no missing values.

    .. _`nursery`: https://www.openml.org/d/26

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged groups
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "nursery", "classification", astype="pandas", preprocess=preprocess
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    if preprocess:
        parents = pd.Series(orig_X["parents_usual"] == 0, dtype=np.float64)
        dropped_X = orig_X.drop(
            labels=[
                "parents_great_pret",
                "parents_pretentious",
                "parents_usual",
            ],
            axis=1,
        )
        encoded_X = dropped_X.assign(parents=parents)
        # orig_y == 3 corresponds to "spec_prior"
        encoded_y = pd.Series((orig_y == 3), dtype=np.float64)
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [{"feature": "parents", "reference_group": [1]}],
        }
        return encoded_X, encoded_y, fairness_info
    else:
        fairness_info = {
            "favorable_labels": ["spec_prior"],
            "protected_attributes": [
                {
                    "feature": "parents",
                    "reference_group": ["great_pret", "pretentious"],
                }
            ],
        }
        return orig_X, orig_y, fairness_info


def fetch_titanic_df(preprocess: bool = False):
    """
    Fetch the `Titanic`_ dataset from OpenML and add `fairness_info`.

    It contains data gathered from passengers on the Titanic with a
    binary classification into "survived" or "did not survive". Without
    preprocessing, the dataset has 1,309 rows and 13 columns. There is one
    protected attribute, sex, and the disparate impact is 0.26. The data
    includes both categorical and numeric columns, with some missing
    values.

    .. _`Titanic`: https://www.openml.org/d/40945

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attributes in X as 0 or 1 to indicate privileged groups;
      and apply one-hot encoding to any remaining features in X that
      are categorical and not protected attributes.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    (train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
        "titanic", "classification", astype="pandas", preprocess=preprocess
    )
    orig_X = pd.concat([train_X, test_X]).sort_index()
    orig_y = pd.concat([train_y, test_y]).sort_index()
    if preprocess:
        sex = pd.Series(orig_X["sex_female"] == 1, dtype=np.float64)
        columns_to_drop = ["sex_female", "sex_male"]

        # drop more columns that turn into gigantic one-hot encodings
        # otherwise, like name and cabin
        def extra_categorical_columns_filter(c: str):
            return (
                c.startswith("name")
                or c.startswith("ticket")
                or c.startswith("cabin")
                or c.startswith("home.dest")
            )

        columns_to_drop.extend(
            [x for x in orig_X.columns if extra_categorical_columns_filter(x)]
        )
        dropped_X = orig_X.drop(labels=columns_to_drop, axis=1)
        encoded_X = dropped_X.assign(sex=sex)
        fairness_info = {
            "favorable_labels": [1],
            "protected_attributes": [
                {"feature": "sex", "reference_group": [1]},
            ],
        }
        return encoded_X, orig_y, fairness_info
    else:
        fairness_info = {
            "favorable_labels": ["1"],
            "protected_attributes": [
                {"feature": "sex", "reference_group": ["female"]},
            ],
        }
        return orig_X, orig_y, fairness_info


# MEPS HELPERS


class _MepsYear(Enum):
    FY2015 = 15
    FY2016 = 16


class _MepsPanel(Enum):
    PANEL19 = 19
    PANEL20 = 20
    PANEL21 = 21


def _race(row):
    if (row["HISPANX"] == 2) and (
        row["RACEV2X"] == 1
    ):  # non-Hispanic Whites are marked as WHITE; all others as NON-WHITE
        return "White"
    return "Non-White"


def _get_utilization_columns(fiscal_year):
    return [
        f"OBTOTV{fiscal_year.value}",
        f"OPTOTV{fiscal_year.value}",
        f"ERTOT{fiscal_year.value}",
        f"IPNGTD{fiscal_year.value}",
        f"HHTOTD{fiscal_year.value}",
    ]


def _get_total_utilization(row, fiscal_year):
    cols = _get_utilization_columns(fiscal_year)
    return sum((row[x] for x in cols))


def _should_drop_column(x, fiscal_year):
    utilization_cols = set(_get_utilization_columns(fiscal_year))
    return x in utilization_cols


def _fetch_meps_raw_df(panel, fiscal_year):
    filename = ""
    if fiscal_year == _MepsYear.FY2015:
        assert panel in [_MepsPanel.PANEL19, _MepsPanel.PANEL20]
        filename = "h181.csv"
    elif fiscal_year == _MepsYear.FY2016:
        assert panel == _MepsPanel.PANEL21
        filename = "h192.csv"
    else:
        logger.error(f"Unexpected FiscalYear received: {fiscal_year}")
        raise ValueError(f"Unexpected FiscalYear received: {fiscal_year}")
    filepath = os.path.join(
        os.path.dirname(os.path.abspath(aif360.__file__)),
        "data",
        "raw",
        "meps",
        filename,
    )
    df: typing.Any = None
    try:
        df = pd.read_csv(filepath, sep=",", na_values=[])
    except IOError as err:
        logger.error(f"IOError: {err}")
        logger.error("To use this class, please follow the instructions found here:")
        logger.error(
            f"\n\t{'https://github.com/Trusted-AI/AIF360/tree/master/aif360/data/raw/meps'}\n"
        )
        logger.error(
            f"\n to download and convert the data and place the final {filename} file, as-is, in the folder:"
        )
        logger.error(f"\n\t{os.path.abspath(os.path.dirname(filepath))}\n")
        import sys

        sys.exit(1)
    df["RACEV2X"] = df.apply(_race, axis=1)
    df = df.rename(columns={"RACEV2X": "RACE"})
    df = df[df["PANEL"] == panel.value]
    # aggregate utilization under the TOTEXP15 name regardless of fiscal
    # year, binarize it (1 if at least 10 visits/events, else 0), and then
    # rename it to UTILIZATION
    df["TOTEXP15"] = df.apply(
        lambda row: _get_total_utilization(row, fiscal_year), axis=1
    )
    lessE = df["TOTEXP15"] < 10.0
    df.loc[lessE, "TOTEXP15"] = 0.0
    moreE = df["TOTEXP15"] >= 10.0
    df.loc[moreE, "TOTEXP15"] = 1.0
    df = df.rename(columns={"TOTEXP15": "UTILIZATION"})
    columns_to_drop = set(
        (x for x in df.columns.tolist() if _should_drop_column(x, fiscal_year))
    )
    df = df[sorted(set(df.columns.tolist()) - columns_to_drop, key=df.columns.get_loc)]
    X = pd.DataFrame(
        df, columns=[x for x in df.columns.tolist() if x != "UTILIZATION"]
    ).sort_index()
    y = pd.Series(df["UTILIZATION"], name="UTILIZATION").sort_index()
    fairness_info = {
        "favorable_labels": [1],
        "protected_attributes": [
            {"feature": "RACE", "reference_group": ["White"]},
        ],
    }
    return X, y, fairness_info


def _get_pandas_and_fairness_info_from_meps_dataset(dataset):
    X, y = lale.lib.aif360.util.dataset_to_pandas(dataset)
    fairness_info = {
        "favorable_labels": [1],
        "protected_attributes": [
            {"feature": "RACE", "reference_group": [1]},
        ],
    }
    return X, y, fairness_info
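

# A small sketch of the file location _fetch_meps_raw_df reads from, mirroring
# the error message above; h181.csv covers fiscal year 2015 (panels 19 and 20)
# and h192.csv covers fiscal year 2016 (panel 21).
def _example_meps_expected_path(filename="h181.csv"):
    return os.path.join(
        os.path.dirname(os.path.abspath(aif360.__file__)),
        "data",
        "raw",
        "meps",
        filename,
    )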


def fetch_meps_panel19_fy2015_df(preprocess: bool = False):
    """
    Fetch a subset of the `MEPS`_ dataset from aif360 and add `fairness_info`.

    It contains information collected on a nationally representative
    sample of the civilian noninstitutionalized population of the United
    States, specifically reported medical expenditures and civilian
    demographics. This dataframe corresponds to data from panel 19 of the
    year 2015. Without preprocessing, the dataframe contains 16,578 rows
    and 1,825 columns. (With preprocessing the dataframe contains 15,830
    rows and 138 columns.) There is one protected attribute, race, and
    the disparate impact is 0.496 if preprocessing is not applied and
    0.490 if preprocessing is applied. The data includes numeric and
    categorical columns, with some missing values.

    Note: in order to use this dataset, be sure to follow the instructions
    found in the `AIF360 documentation`_ and accept the corresponding license
    agreement.

    .. _`MEPS`: https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-181
    .. _`AIF360 documentation`: https://github.com/Trusted-AI/AIF360/tree/master/aif360/data/raw/meps

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attribute in X corresponding to race as 0 or 1
      to indicate privileged groups;
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      rename columns that are panel or round-specific;
      drop columns such as ID columns that are not relevant to the task
      at hand;
      and drop rows where features are unknown.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    if preprocess:
        dataset = aif360.datasets.MEPSDataset19()
        return _get_pandas_and_fairness_info_from_meps_dataset(dataset)
    else:
        return _fetch_meps_raw_df(_MepsPanel.PANEL19, _MepsYear.FY2015)


def fetch_meps_panel20_fy2015_df(preprocess: bool = False):
    """
    Fetch a subset of the `MEPS`_ dataset from aif360 and add `fairness_info`.

    It contains information collected on a nationally representative
    sample of the civilian noninstitutionalized population of the United
    States, specifically reported medical expenditures and civilian
    demographics. This dataframe corresponds to data from panel 20 of the
    year 2015. Without preprocessing, the dataframe contains 18,849 rows
    and 1,825 columns. (With preprocessing the dataframe contains 17,570
    rows and 138 columns.) There is one protected attribute, race, and
    the disparate impact is 0.493 if preprocessing is not applied and
    0.488 if preprocessing is applied. The data includes numeric and
    categorical columns, with some missing values.

    Note: in order to use this dataset, be sure to follow the instructions
    found in the `AIF360 documentation`_ and accept the corresponding license
    agreement.

    .. _`MEPS`: https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-181
    .. _`AIF360 documentation`: https://github.com/Trusted-AI/AIF360/tree/master/aif360/data/raw/meps

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attribute in X corresponding to race as 0 or 1
      to indicate privileged groups;
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      rename columns that are panel or round-specific;
      drop columns such as ID columns that are not relevant to the task
      at hand;
      and drop rows where features are unknown.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    if preprocess:
        dataset = aif360.datasets.MEPSDataset20()
        return _get_pandas_and_fairness_info_from_meps_dataset(dataset)
    else:
        return _fetch_meps_raw_df(_MepsPanel.PANEL20, _MepsYear.FY2015)


def fetch_meps_panel21_fy2016_df(preprocess: bool = False):
    """
    Fetch a subset of the `MEPS`_ dataset from aif360 and add `fairness_info`.

    It contains information collected on a nationally representative
    sample of the civilian noninstitutionalized population of the United
    States, specifically reported medical expenditures and civilian
    demographics. This dataframe corresponds to data from panel 21 of the
    year 2016. Without preprocessing, the dataframe contains 17,052 rows
    and 1,936 columns. (With preprocessing the dataframe contains 15,675
    rows and 138 columns.) There is one protected attribute, race, and
    the disparate impact is 0.462 if preprocessing is not applied and
    0.451 if preprocessing is applied. The data includes numeric and
    categorical columns, with some missing values.

    Note: in order to use this dataset, be sure to follow the instructions
    found in the `AIF360 documentation`_ and accept the corresponding license
    agreement.

    .. _`MEPS`: https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-181
    .. _`AIF360 documentation`: https://github.com/Trusted-AI/AIF360/tree/master/aif360/data/raw/meps

    Parameters
    ----------
    preprocess : boolean, optional, default False

      If True,
      encode protected attribute in X corresponding to race as 0 or 1
      to indicate privileged groups;
      encode labels in y as 0 or 1 to indicate favorable outcomes;
      rename columns that are panel or round-specific;
      drop columns such as ID columns that are not relevant to the task
      at hand;
      and drop rows where features are unknown.

    Returns
    -------
    result : tuple

      - item 0: pandas DataFrame

          Features X, including both protected and non-protected attributes.

      - item 1: pandas Series

          Labels y.

      - item 2: fairness_info

          JSON meta-data following the format understood by fairness metrics
          and mitigation operators in `lale.lib.aif360`.
    """
    if preprocess:
        dataset = aif360.datasets.MEPSDataset21()
        return _get_pandas_and_fairness_info_from_meps_dataset(dataset)
    else:
        return _fetch_meps_raw_df(_MepsPanel.PANEL21, _MepsYear.FY2016)
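

# A closing sketch: the MEPS fetchers compose with the same metric API as the
# OpenML-based ones. It assumes statistical_parity_difference is among the
# metric factories exported by lale.lib.aif360; verify against the installed
# lale version.
def _example_meps_parity():
    import lale.lib.aif360

    X, y, fairness_info = fetch_meps_panel19_fy2015_df(preprocess=True)
    spd_scorer = lale.lib.aif360.statistical_parity_difference(**fairness_info)
    return spd_scorer.score_data(X=X, y_pred=y)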