Skip to content

Collections

The collections module provides predefined collections of fairness scenarios, developed to maximize the diversity in how existing fair ML methods perform on them.

Prespecified Collections

fairml_datasets.collections.Corpus

Bases: Collection

The full corpus including all scenarios and datasets.

This collection contains all available datasets and their associated scenarios, providing a comprehensive set for fairness analysis across the entire corpus.

Source code in fairml_datasets/collections.py
class Corpus(Collection):
    """
    Collection spanning every dataset and every scenario in the corpus.

    Each available dataset is expanded into its associated scenarios, and the
    combined list is handed to the ``Collection`` base class, giving a
    comprehensive set for fairness analysis across the entire corpus.
    """

    def __init__(self, inclue_large_datasets=True):
        """
        Build the corpus from all available datasets and their scenarios.

        Args:
            inclue_large_datasets: Whether to include datasets marked as
                'large'. The keyword is spelled ``inclue`` here and is
                forwarded under the same name to ``Datasets``.
        """
        datasets = Datasets(inclue_large_datasets=inclue_large_datasets)
        # Flatten the per-dataset scenario lists into a single list.
        scenarios = [
            scenario
            for dataset in datasets
            for scenario in generate_dataset_scenarios(dataset=dataset)
        ]

        super().__init__(scenarios=scenarios)

Functions

__init__(inclue_large_datasets=True)

Initialize the Corpus with all available datasets and scenarios.

Parameters:

Name Type Description Default
inclue_large_datasets

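Whether to include datasets marked as 'large'. Note that the parameter name is spelled `inclue_large_datasets` (sic) and must be passed with that exact spelling.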

True
Source code in fairml_datasets/collections.py
def __init__(self, inclue_large_datasets=True):
    """
    Build the corpus from all available datasets and their scenarios.

    Args:
        inclue_large_datasets: Whether to include datasets marked as
            'large'. The keyword is spelled ``inclue`` here and is
            forwarded under the same name to ``Datasets``.
    """
    datasets = Datasets(inclue_large_datasets=inclue_large_datasets)
    # Flatten the per-dataset scenario lists into a single list.
    scenarios = [
        scenario
        for dataset in datasets
        for scenario in generate_dataset_scenarios(dataset=dataset)
    ]

    super().__init__(scenarios=scenarios)

fairml_datasets.collections.DecorrelatedSmall

Bases: PrespecifiedCollection

Collection of De-Correlated Datasets with k = 5.

This corresponds to Scenarios described in Table 3 identified with a k.

Source code in fairml_datasets/collections.py
class DecorrelatedSmall(PrespecifiedCollection):
    """
    Collection of De-Correlated Datasets with k = 5.

    This corresponds to Scenarios described in Table 3 identified with a k.

    The class body only declares ``scenario_ids``; everything else comes from
    ``PrespecifiedCollection``.
    """

    # Scenario identifiers making up this collection (5 entries, matching k = 5).
    # Each ID appears to have the form "<dataset>:<sensitive column(s)>", with
    # multiple columns separated by ";" — verify against PrespecifiedCollection.
    scenario_ids = [
        "folktables_acspubliccoverage:RAC1P",
        "heart_disease:sex",
        "hmda:applicant_sex_name;applicant_race_name_1",
        # NOTE(review): unlike the other multi-column IDs, this one has a space
        # after the second ";" — confirm it matches the registered scenario ID.
        "stop_question_and_frisk_data:SUSPECT_SEX;SUSPECT_RACE_DESCRIPTION; SUSPECT_REPORTED_AGE",
        "folktables_acsemployment_small:RAC1P",
    ]

fairml_datasets.collections.DecorrelatedLarge

Bases: PrespecifiedCollection

Collection of De-Correlated Datasets with tau = 0.

This corresponds to Scenarios described in Table 3 identified with a tau.

Source code in fairml_datasets/collections.py
class DecorrelatedLarge(PrespecifiedCollection):
    """
    Collection of De-Correlated Datasets with tau = 0.

    This corresponds to Scenarios described in Table 3 identified with a tau.

    The class body only declares ``scenario_ids``; everything else comes from
    ``PrespecifiedCollection``.
    """

    # Scenario identifiers making up this collection (22 entries). The first
    # five are the same IDs, in the same order, as DecorrelatedSmall.scenario_ids.
    # Each ID appears to have the form "<dataset>:<sensitive column(s)>", with
    # multiple columns separated by ";" — verify against PrespecifiedCollection.
    scenario_ids = [
        "folktables_acspubliccoverage:RAC1P",
        "heart_disease:sex",
        "hmda:applicant_sex_name;applicant_race_name_1",
        # NOTE(review): unlike the other multi-column IDs, this one has a space
        # after the second ";" — confirm it matches the registered scenario ID.
        "stop_question_and_frisk_data:SUSPECT_SEX;SUSPECT_RACE_DESCRIPTION; SUSPECT_REPORTED_AGE",
        "folktables_acsemployment_small:RAC1P",
        "folktables_acstraveltime:RAC1P",
        "compas:sex;age",
        "folktables_acsincome_small:RAC1P",
        "compas_2_years:age",
        "communities_unnormalized:pct12-21",
        "arrhythmia:sex",
        "folktables_acspubliccoverage_small:RAC1P",
        "compas_2_years_violent:age",
        "south_german_credit:age;foreign_worker",
        "dutch:age",
        "folktables_acsmobility_small:RAC1P",
        "law_school_tensorflow:gender",
        "german_credit_onehot:<= 25 years",
        "communities:racePctAsian",
        "nursery:finance",
        "german_credit_numeric:age",
        "chicago_strategic_subject_list:RACE CODE CD",
    ]

fairml_datasets.collections.PermissivelyLicensedSmall

Bases: PrespecifiedCollection

Collection of Permissively Licensed Datasets with k = 5.

This corresponds to Scenarios described in Table 4 identified with a k.

Source code in fairml_datasets/collections.py
class PermissivelyLicensedSmall(PrespecifiedCollection):
    """
    Collection of Permissively Licensed Datasets with k = 5.

    This corresponds to Scenarios described in Table 4 identified with a k.

    The class body only declares ``scenario_ids``; everything else comes from
    ``PrespecifiedCollection``.
    """

    # Scenario identifiers making up this collection (5 entries, matching k = 5).
    # Each ID appears to have the form "<dataset>:<sensitive column(s)>", with
    # multiple columns separated by ";" — verify against PrespecifiedCollection.
    scenario_ids = [
        "folktables_acspubliccoverage:RAC1P",
        "heart_disease:sex",
        "communities_unnormalized:pct12-21",
        "lipton_synthetic_hiring_dataset:sex",
        "bank:age;marital",
    ]

fairml_datasets.collections.PermissivelyLicensedLarge

Bases: PrespecifiedCollection

Collection of Permissively Licensed Datasets with tau = 0.

This corresponds to Scenarios described in Table 4 identified with a tau.

Source code in fairml_datasets/collections.py
class PermissivelyLicensedLarge(PrespecifiedCollection):
    """
    Collection of Permissively Licensed Datasets with tau = 0.

    This corresponds to Scenarios described in Table 4 identified with a tau.

    The class body only declares ``scenario_ids``; everything else comes from
    ``PrespecifiedCollection``.
    """

    # Scenario identifiers making up this collection (16 entries). The first
    # five are the same IDs, in the same order, as
    # PermissivelyLicensedSmall.scenario_ids.
    # Each ID appears to have the form "<dataset>:<sensitive column(s)>", with
    # multiple columns separated by ";" — verify against PrespecifiedCollection.
    scenario_ids = [
        "folktables_acspubliccoverage:RAC1P",
        "heart_disease:sex",
        "communities_unnormalized:pct12-21",
        "lipton_synthetic_hiring_dataset:sex",
        "bank:age;marital",
        "german_credit_onehot:> 25 years",
        "folktables_acsincome:RAC1P",
        "south_german_credit:age",
        "folktables_acsemployment_small:RAC1P",
        "german_credit_numeric:age",
        "student:sex;age",
        "folktables_acstraveltime_small:RAC1P",
        "folktables_acspubliccoverage_small:RAC1P",
        "communities:agePct16t24",
        "folktables_acsmobility:RAC1P",
        "law_school_tensorflow:gender",
    ]

fairml_datasets.collections.PermissivelyLicensedFull

Bases: PrespecifiedCollection

Full collection of Permissively Licensed Datasets.

This corresponds to all Scenarios described in Table 4.

Source code in fairml_datasets/collections.py
class PermissivelyLicensedFull(PrespecifiedCollection):
    """
    Full collection of Permissively Licensed Datasets.

    This corresponds to all Scenarios described in Table 4.

    The class body only declares ``scenario_ids``; everything else comes from
    ``PrespecifiedCollection``.
    """

    # Scenario identifiers making up this collection (32 entries). The first
    # sixteen are the same IDs, in the same order, as
    # PermissivelyLicensedLarge.scenario_ids.
    # Each ID appears to have the form "<dataset>:<sensitive column(s)>", with
    # multiple columns separated by ";" — verify against PrespecifiedCollection.
    scenario_ids = [
        "folktables_acspubliccoverage:RAC1P",
        "heart_disease:sex",
        "communities_unnormalized:pct12-21",
        "lipton_synthetic_hiring_dataset:sex",
        "bank:age;marital",
        "german_credit_onehot:> 25 years",
        "folktables_acsincome:RAC1P",
        "south_german_credit:age",
        "folktables_acsemployment_small:RAC1P",
        "german_credit_numeric:age",
        "student:sex;age",
        "folktables_acstraveltime_small:RAC1P",
        "folktables_acspubliccoverage_small:RAC1P",
        "communities:agePct16t24",
        "folktables_acsmobility:RAC1P",
        "law_school_tensorflow:gender",
        "arrhythmia:sex",
        "adult:race",
        "nursery:finance;parents",
        "folktables_acsincome_small:RAC1P",
        "creditcard:SEX",
        "folktables_acsmobility_small:RAC1P",
        "student_language:age",
        "drug:ethnicity",
        "law_school_lequy:racetxt;male",
        "folktables_acstraveltime:RAC1P",
        "bank_additional_full:age;marital",
        "german_credit:foreign_worker",
        "generate_synthetic_data:s1",
        "bank_additional:age",
        "folktables_acsemployment:RAC1P",
        "bank_full:age",
    ]

fairml_datasets.collections.GeographicSmall

Bases: PrespecifiedCollection

Collection of Geographically Diverse Datasets with k = 5.

This corresponds to Scenarios described in Table 5 identified with a k.

Source code in fairml_datasets/collections.py
class GeographicSmall(PrespecifiedCollection):
    """
    Collection of Geographically Diverse Datasets with k = 5.

    This corresponds to Scenarios described in Table 5 identified with a k.

    The class body only declares ``scenario_ids``; everything else comes from
    ``PrespecifiedCollection``.
    """

    # Scenario identifiers making up this collection (5 entries, matching k = 5).
    # Each ID appears to have the form "<dataset>:<sensitive column(s)>", with
    # multiple columns separated by ";" — verify against PrespecifiedCollection.
    scenario_ids = [
        "folktables_acspubliccoverage:RAC1P",
        "heart_disease:sex",
        "dutch:age;citizenship",
        "creditcard:SEX",
        "german_credit_onehot:> 25 years",
    ]

fairml_datasets.collections.GeographicLarge

Bases: PrespecifiedCollection

Collection of Geographically Diverse Datasets with tau = 0.

This corresponds to Scenarios described in Table 5 identified with a tau.

Source code in fairml_datasets/collections.py
class GeographicLarge(PrespecifiedCollection):
    """
    Collection of Geographically Diverse Datasets with tau = 0.

    This corresponds to Scenarios described in Table 5 identified with a tau.

    The class body only declares ``scenario_ids``; everything else comes from
    ``PrespecifiedCollection``.
    """

    # Scenario identifiers making up this collection (6 entries): the same five
    # IDs as GeographicSmall.scenario_ids, in the same order, plus "student:sex".
    # Each ID appears to have the form "<dataset>:<sensitive column(s)>", with
    # multiple columns separated by ";" — verify against PrespecifiedCollection.
    scenario_ids = [
        "folktables_acspubliccoverage:RAC1P",
        "heart_disease:sex",
        "dutch:age;citizenship",
        "creditcard:SEX",
        "german_credit_onehot:> 25 years",
        "student:sex",
    ]

fairml_datasets.collections.GeographicFull

Bases: PrespecifiedCollection

Full collection of Geographically Diverse Datasets.

This corresponds to all Scenarios described in Table 5.

Source code in fairml_datasets/collections.py
class GeographicFull(PrespecifiedCollection):
    """
    Full collection of Geographically Diverse Datasets.

    This corresponds to all Scenarios described in Table 5.

    The class body only declares ``scenario_ids``; everything else comes from
    ``PrespecifiedCollection``.
    """

    # Scenario identifiers making up this collection (10 entries). The first
    # six are the same IDs, in the same order, as GeographicLarge.scenario_ids.
    # Each ID appears to have the form "<dataset>:<sensitive column(s)>", with
    # multiple columns separated by ";" — verify against PrespecifiedCollection.
    scenario_ids = [
        "folktables_acspubliccoverage:RAC1P",
        "heart_disease:sex",
        "dutch:age;citizenship",
        "creditcard:SEX",
        "german_credit_onehot:> 25 years",
        "student:sex",
        "arrhythmia:sex",
        "nursery:finance;parents",
        "synth:sensible_feature",
        "drug:ethnicity",
    ]

Usage Examples

Using the Complete Corpus

from fairml_datasets.collections import Corpus

# Create the corpus (all available datasets and their scenarios).
# Note: the keyword argument is spelled `inclue_large_datasets` in the API.
corpus = Corpus(inclue_large_datasets=True)

# Iterate through all scenarios in the corpus
for scenario in corpus:
    # Print which dataset the scenario belongs to and its sensitive columns
    print(f"Dataset: {scenario.dataset_id}")
    print(f"Sensitive columns: {scenario.sensitive_columns}")

    # Load the scenario's data, requesting the "prepared" stage
    df = scenario.load(stage="prepared")

Using Predefined Collections

from fairml_datasets.collections import (
    DecorrelatedSmall,
    GeographicLarge,
    PermissivelyLicensedFull,
)

# Instantiate the small decorrelated collection and report its size
collection = DecorrelatedSmall()
print(f"Collection contains {len(collection)} scenarios")

# Alternatively, instantiate the full permissively licensed collection
full_collection = PermissivelyLicensedFull()
print(f"Full collection contains {len(full_collection)} scenarios")

# Walk through the scenarios of the large geographic collection
geo_collection = GeographicLarge()
for scenario in geo_collection:
    print(f"Dataset: {scenario.dataset_id}")
    print(f"Sensitive columns: {scenario.sensitive_columns}")

    # Load the scenario's data, requesting the "prepared" stage
    df = scenario.load(stage="prepared")